ModelZoo / YOLO-World_pytorch · Commits

Commit e9cee049, authored May 31, 2024 by luopl
Initial commit
Pipeline #1056: canceled with stages
Changes: 166 · Pipelines: 1
Showing 20 changed files with 3405 additions and 0 deletions (+3405 -0)
configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py  +145 -0
configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py  +146 -0
configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py  +184 -0
configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py  +183 -0
configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py  +173 -0
configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py  +200 -0
configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +171 -0
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py  +202 -0
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +171 -0
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py  +171 -0
configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py  +198 -0
configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +171 -0
configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +176 -0
configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py  +195 -0
configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +170 -0
configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py  +199 -0
configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +171 -0
configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +185 -0
configs/pretrain_v1/README.md  +22 -0
configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py  +172 -0
configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 80
num_training_classes = 80
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = '../FastDet/output_models/pretrain_yolow-v8_s_clipv2_frozen_te_noprompt_t2i_bn_2e-3adamw_scale_lr_wd_32xb16-100e_obj365v1_goldg_cc3mram250k_train_lviseval-e3592307_rep_conv.pth'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3

# model settings
model = dict(
    type='SimpleYOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_classes,
    num_test_classes=num_classes,
    reparameterized=True,
    data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        text_model=None,
        image_model={{_base_.model.backbone}},
        with_text_model=False),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='EfficientCSPLayerWithTwoConv')),
    bbox_head=dict(
        head_module=dict(
            type='RepYOLOWorldHeadModule',
            embed_dims=text_channels,
            num_guide=num_classes,
            num_classes=num_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_classes)))

# dataset settings
final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]
coco_train_dataset = dict(
    type='YOLOv5CocoDataset',
    data_root='data/coco',
    ann_file='annotations/instances_train2017.json',
    data_prefix=dict(img='train2017/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(
    type='YOLOv5CocoDataset',
    data_root='data/coco',
    ann_file='annotations/instances_val2017.json',
    data_prefix=dict(img='val2017/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/coco/annotations/instances_val2017.json',
                     metric='bbox')
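A minimal sketch of how a composed config like the one above can be inspected, assuming MMEngine's standard Config.fromfile API and the repository layout referenced by _base_ (a working mmyolo checkout under third_party/); the comments show values set in this file, while inherited values depend on the base config:

# Illustrative only: load the composed config and print a few resolved fields.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/finetune_coco/'
    'yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py')

print(cfg.model.type)                  # 'SimpleYOLOWorldDetector'
print(cfg.neck_embed_channels)         # [128, 256, last_stage_out_channels // 2]
print(cfg.optim_wrapper.optimizer.lr)  # 2e-4
print(cfg.train_cfg.max_epochs)        # 80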
configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 80
num_training_classes = 80
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = '../FastDet/output_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea_rep_conv.pth'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3

# model settings
model = dict(
    type='SimpleYOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_classes,
    num_test_classes=num_classes,
    reparameterized=True,
    data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        text_model=None,
        image_model={{_base_.model.backbone}},
        with_text_model=False),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=num_classes,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv',
                       guide_channels=num_classes)),
    bbox_head=dict(
        head_module=dict(
            type='RepYOLOWorldHeadModule',
            embed_dims=text_channels,
            num_guide=num_classes,
            num_classes=num_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_classes)))

# dataset settings
final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]
coco_train_dataset = dict(
    type='YOLOv5CocoDataset',
    data_root='data/coco',
    ann_file='annotations/instances_train2017.json',
    data_prefix=dict(img='train2017/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(
    type='YOLOv5CocoDataset',
    data_root='data/coco',
    ann_file='annotations/instances_val2017.json',
    data_prefix=dict(img='val2017/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/coco/annotations/instances_val2017.json',
                     metric='bbox')
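The finetune configs keep the base mmyolo schedule hooks, so the learning rate decays linearly from base_lr toward base_lr * lr_factor over max_epochs. A rough arithmetic sketch, assuming mmyolo's usual linear lr_factor semantics and reading the '8gpus' in the file name as 8 data-parallel workers:

# Implied schedule endpoints and global batch for the 2e-4 / 80-epoch / 8-GPU finetune.
base_lr = 2e-4
lr_factor = 0.01
max_epochs = 80
gpus, per_gpu_batch = 8, 16

final_lr = base_lr * lr_factor        # 2e-06 at the end of the 80-epoch schedule
global_batch = gpus * per_gpu_batch   # 128 images per optimizer step
print(final_lr, global_batch)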
configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 80
num_training_classes = 80
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth'
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=train_pipeline)
train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(custom_keys={
        'backbone.text_model': dict(lr_mult=0.01),
        'logit_scale': dict(weight_decay=0.0)
    }),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/coco/annotations/instances_val2017.json',
                     metric='bbox')
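In the non-reparameterized configs the optimizer wrapper adds per-key overrides through paramwise_cfg. A short sketch of what those custom_keys imply, assuming MMEngine's standard lr_mult / weight_decay override semantics; note that frozen_modules=['all'] freezes the CLIP text encoder here, so the lr_mult would only take effect if that module were unfrozen:

# Per-parameter-group settings implied by the custom_keys above.
base_lr = 2e-4
text_model_lr = base_lr * 0.01   # lr_mult=0.01 for parameters under 'backbone.text_model'
logit_scale_wd = 0.0             # weight decay disabled for 'logit_scale'
print(text_model_lr, logit_scale_wd)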
configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 80
num_training_classes = 80
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth'
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
# text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=train_pipeline)
train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(custom_keys={
        'backbone.text_model': dict(lr_mult=0.01),
        'logit_scale': dict(weight_decay=0.0)
    }),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/coco/annotations/instances_val2017.json',
                     metric='bbox')
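text_model_name selects the CLIP text encoder wrapped by HuggingCLIPLanguageBackbone, either a local directory or a HuggingFace Hub id. As an illustrative sketch only (it assumes the wrapper ultimately loads a standard transformers CLIP text model; it is not the in-repo class), this is what the Hub id resolves to and why text_channels is 512 in these configs:

# Illustrative: resolve the CLIP text encoder named by text_model_name with transformers.
from transformers import AutoTokenizer, CLIPTextModelWithProjection

name = 'openai/clip-vit-base-patch32'   # or the local ../pretrained_models path above
tokenizer = AutoTokenizer.from_pretrained(name)
text_encoder = CLIPTextModelWithProjection.from_pretrained(name).eval()

tokens = tokenizer(['person', 'bicycle'], padding=True, return_tensors='pt')
embeds = text_encoder(**tokens).text_embeds   # shape (2, 512), matching text_channels
print(embeds.shape)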
configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 80
num_training_classes = 80
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
# text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

# scaling model from X to XL
deepen_factor = 1.0
widen_factor = 1.5
backbone = _base_.model.backbone
backbone.update(deepen_factor=deepen_factor, widen_factor=widen_factor)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model=backbone,
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              deepen_factor=deepen_factor,
              widen_factor=widen_factor,
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    widen_factor=widen_factor,
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=train_pipeline)
train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/coco_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/coco/annotations/instances_val2017.json',
                     metric='bbox')
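The XL variant above is produced purely by re-scaling the YOLOv8-X base through deepen_factor / widen_factor. A back-of-the-envelope sketch, assuming the usual YOLOv8 convention that real channel widths are nominal widths multiplied by widen_factor; the nominal 512 channels and the X-scale factor 1.25 are assumptions about the mmyolo base config, not values stated in this file:

# Rough channel-width comparison between the X base and the XL override above.
nominal_last_stage = 512   # assumed nominal last_stage_out_channels of the YOLOv8-X base
widen_factor_x = 1.25      # assumed widen_factor in the YOLOv8-X base config
widen_factor_xl = 1.5      # value set in this file

print(int(nominal_last_stage * widen_factor_x))   # 640 channels for the X backbone
print(int(nominal_last_stage * widen_factor_xl))  # 768 channels after the XL re-scale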
configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 768
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.0125
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-large-patch14-336'
text_model_name = 'openai/clip-vit-large-patch14-336'
img_scale = (800, 800)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(type='LetterResize',
         scale=img_scale,
         allow_scale_up=True,
         pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)
mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset,
                                         mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(type='LetterResize',
         scale=img_scale,
         allow_scale_up=False,
         pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')
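Two numbers in this pretrain config follow directly from the values set above: the border offset used by YOLOv5RandomAffine for the 800 x 800 resolution, and the epoch at which mmdet.PipelineSwitchHook swaps in the stage-2 pipeline (mosaic off):

# Simple arithmetic implied by the config values above.
img_scale = (800, 800)
border = (-img_scale[0] // 2, -img_scale[1] // 2)  # (-400, -400), passed to YOLOv5RandomAffine

max_epochs, close_mosaic_epochs = 100, 2
switch_epoch = max_epochs - close_mosaic_epochs    # stage-2 pipeline from epoch 98 onward
print(border, switch_epoch)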
configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 768
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.0125
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-large-patch14-336'
text_model_name = 'openai/clip-vit-large-patch14-336'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)
mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset,
                                         mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')
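The '4x8gpus' in these pretrain file names is read here as 4 nodes of 8 GPUs each (an assumption based on the naming; the config itself only fixes the per-GPU batch). Combined with train_batch_size_per_gpu, that gives the implied global batch:

# Implied global batch for the 2e-3 pretrain configs (node/GPU split is assumed).
nodes, gpus_per_node, per_gpu = 4, 8, 16
global_batch = nodes * gpus_per_node * per_gpu   # 512 images per optimizer step
print(global_batch)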
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 20  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.025
train_batch_size_per_gpu = 4
load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth"
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
img_scale = (1280, 1280)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(type='LetterResize',
         scale=img_scale,
         allow_scale_up=True,
         pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)
mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset,
                                         mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(type='LetterResize',
         scale=img_scale,
         allow_scale_up=False,
         pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py (new file, mode 100644)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)
mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)
flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)
train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset,
                                         mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(bias_decay_mult=0.0,
                       norm_decay_mult=0.0,
                       custom_keys={
                           'backbone.text_model': dict(lr_mult=0.01),
                           'logit_scale': dict(weight_decay=0.0)
                       }),
    constructor='YOLOWv5OptimizerConstructor')
configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_val.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_val.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
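`neck_embed_channels` and `neck_num_heads` are derived from the base model's `last_stage_out_channels`, so they scale automatically with whichever YOLOv8 backbone a config inherits from. As a worked example, assuming `last_stage_out_channels = 512` (the value I would expect from the YOLOv8-L base config; other model sizes use different values, so treat this number as an assumption):

```python
last_stage_out_channels = 512  # assumed value for the YOLOv8-L base config

neck_embed_channels = [128, 256, last_stage_out_channels // 2]
neck_num_heads = [4, 8, last_stage_out_channels // 2 // 32]

print(neck_embed_channels)  # [128, 256, 256]
print(neck_num_heads)       # [4, 8, 8]
```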
configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_m_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
img_scale = (1280, 1280)
text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
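The "4x8gpus" in these file names and `train_batch_size_per_gpu` together determine the effective global batch size. Assuming "4x8gpus" means 4 nodes with 8 GPUs each (an interpretation based only on the naming), the arithmetic is:

```python
nodes = 4                       # assumed from the '4x8gpus' naming
gpus_per_node = 8               # assumed from the '4x8gpus' naming
train_batch_size_per_gpu = 16   # value used in this config

global_batch_size = nodes * gpus_per_node * train_batch_size_per_gpu
print(global_batch_size)        # 512 images per optimizer step
```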
configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_m_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = 'openai/clip-vit-large-patch14-336'
# text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
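`HuggingCLIPLanguageBackbone` with `frozen_modules=['all']` keeps the CLIP text encoder fixed and only uses it to turn class names or grounding phrases into `text_channels`-dimensional embeddings. The sketch below is not the YOLO-World implementation, just a minimal illustration with the Hugging Face `transformers` CLIP classes of how 512-d text features for a few class prompts could be produced; the class names are arbitrary examples.

```python
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection

model_name = 'openai/clip-vit-base-patch32'
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = CLIPTextModelWithProjection.from_pretrained(model_name).eval()

class_texts = ['person', 'bicycle', 'traffic light']  # example prompts
inputs = tokenizer(class_texts, padding=True, return_tensors='pt')

with torch.no_grad():  # encoder is frozen, so no gradients are needed
    text_embeds = text_model(**inputs).text_embeds  # shape (3, 512)

# Open-vocabulary heads typically compare L2-normalized embeddings.
text_embeds = torch.nn.functional.normalize(text_embeds, dim=-1)
print(text_embeds.shape)
```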
configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_m_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv',
                       use_einsum=False)),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes,
            use_einsum=False)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
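The `_noeinsum` variant sets `use_einsum=False` on the neck block and the head module, presumably so the contraction between image features and text embeddings can be expressed with plain matrix multiplications for export-friendly runtimes (that rationale is an assumption). The snippet below is a generic illustration of such an equivalence, not the actual YOLO-World kernels:

```python
import torch

b, c, h, w, k = 2, 512, 20, 20, 80
img_feats = torch.randn(b, c, h, w)   # image feature map
txt_embeds = torch.randn(b, k, c)     # one embedding per class text

# einsum form: out[b, k, h, w] = sum_c img[b, c, h, w] * txt[b, k, c]
out_einsum = torch.einsum('bchw,bkc->bkhw', img_feats, txt_embeds)

# einsum-free form using a batched matmul over flattened spatial positions
out_bmm = torch.bmm(txt_embeds, img_feats.flatten(2)).view(b, k, h, w)

print(torch.allclose(out_einsum, out_bmm, atol=1e-5))  # True
```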
configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.025
train_batch_size_per_gpu = 4
img_scale = (1280, 1280)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
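Two schedule details are easy to misread in these configs: parameters matching `backbone.text_model` get a 100x smaller learning rate through `paramwise_cfg.custom_keys` (relevant when the text encoder is not frozen), and `PipelineSwitchHook` replaces the mosaic/affine pipeline for the last `close_mosaic_epochs` epochs. A quick check of the resulting numbers, using the values defined above:

```python
base_lr = 2e-3
text_lr = base_lr * 0.01   # lr_mult for 'backbone.text_model'
print(text_lr)             # 2e-05

max_epochs = 100
close_mosaic_epochs = 2
switch_epoch = max_epochs - close_mosaic_epochs
print(switch_epoch)        # pipeline is switched to stage 2 at epoch 98
```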
configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
img_scale = (1280, 1280)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [
    *_base_.pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform
]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'

# scaling model from X to XL
deepen_factor = 1.0
widen_factor = 1.5
backbone = _base_.model.backbone
backbone.update(deepen_factor=deepen_factor, widen_factor=widen_factor)

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model=backbone,
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            widen_factor=widen_factor,
            use_bn_head=True,
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
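The XL variant reuses the YOLOv8-X base config and simply scales it with `deepen_factor=1.0` and `widen_factor=1.5`, applied to the image backbone, the neck, and the head module. As a rough illustration of what the widening does (the base stage widths below are hypothetical, and the real MMYOLO channel rounding may differ, e.g. via a make-divisible rule):

```python
widen_factor = 1.5

# hypothetical base stage widths; real values come from the YOLOv8-X config
base_widths = [64, 128, 256, 512]
xl_widths = [int(c * widen_factor) for c in base_widths]
print(xl_widths)  # [96, 192, 384, 768]
```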
configs/pretrain_v1/README.md
## Pre-training YOLO-World-v1
> YOLO-World-v1 is an initial version and is now nearly deprecated! We strongly suggest using the [latest version](../pretrain/).
### Zero-shot Inference on LVIS dataset
| model | Pre-train Data | Size | AP<sup>mini</sup> | AP<sub>r</sub> | AP<sub>c</sub> | AP<sub>f</sub> | AP<sup>val</sup> | AP<sub>r</sub> | AP<sub>c</sub> | AP<sub>f</sub> | weights |
| :---- | :------------- | :--- | :---------------: | :------------: | :------------: | :------------: | :--------------: | :------------: | :------------: | :------------: | :-----: |
| [YOLO-World-S](./yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 24.3 | 16.6 | 22.1 | 27.7 | 17.8 | 11.0 | 14.8 | 24.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_s_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-18bea4d2.pth) |
| [YOLO-World-M](./yolo_world_m_dual_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 28.6 | 19.7 | 26.6 | 31.9 | 22.3 | 16.2 | 19.0 | 28.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth) |
| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 32.5 | 22.3 | 30.6 | 36.1 | 24.8 | 17.8 | 22.4 | 32.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth) |
| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.0 | 23.6 | 32.0 | 35.5 | 25.3 | 18.0 | 22.1 | 32.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth) |
| [YOLO-World-X](./yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.4 | 24.4 | 31.6 | 36.6 | 26.6 | 19.2 | 23.5 | 33.2 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_x_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-8cf6b025.pth) |
**NOTE:**
1.
AP
<sup>
mini
</sup>
: evaluated on LVIS
`minival`
.
3.
AP
<sup>
val
</sup>
: evaluated on LVIS
`val 1.0`
.
4.
[
HuggingFace Mirror
](
https://hf-mirror.com/
)
provides the mirror of HuggingFace, which is a choice for users who are unable to reach.
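For users who cannot reach huggingface.co directly, the checkpoints in the table can also be fetched programmatically. A minimal sketch, assuming `huggingface_hub` is installed and using the S-model filename from the table:

```python
import os

# Optional: route downloads through the mirror; hf-mirror.com documents HF_ENDPOINT.
# Set it before importing huggingface_hub so the endpoint is picked up.
os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(
    repo_id="wondervictor/YOLO-World",
    filename="yolo_world_s_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_"
             "o365_goldg_train_pretrained-18bea4d2.pth")
print(ckpt)  # local path of the downloaded checkpoint
```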
configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldDualPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
        text_enhancder=dict(
            type='ImagePoolingAttentionModule',
            embed_channels=256,
            num_heads=8)),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            embed_dims=text_channels,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(
        type='RandomLoadText',
        num_neg_samples=(num_classes, num_classes),
        max_num_samples=num_training_classes,
        padding_to_max=True,
        padding_value=''),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]

obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='gqa/images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='full_images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[
            obj365v1_train_dataset, flickr_train_dataset, mg_train_dataset
        ],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/coco/',
        test_mode=True,
        ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals, rule='greater'))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                     'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')
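For reference, the values this config derives from its base can be checked with a few lines of arithmetic. The sketch below assumes the inherited mmyolo YOLOv8-L base config defines last_stage_out_channels as 512; verify this against your mmyolo checkout before relying on it.

# Sketch: resolve the derived hyper-parameters above for the YOLOv8-L base.
# Assumption: the inherited config sets last_stage_out_channels = 512.
last_stage_out_channels = 512

neck_embed_channels = [128, 256, last_stage_out_channels // 2]   # -> [128, 256, 256]
neck_num_heads = [4, 8, last_stage_out_channels // 2 // 32]      # -> [4, 8, 8]

base_lr = 2e-3
text_model_lr = base_lr * 0.01  # lr_mult for 'backbone.text_model' -> 2e-5
print(neck_embed_channels, neck_num_heads, text_model_lr)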