ModelZoo / ResNet50_tensorflow · Commits

Commit 0225b135 (Unverified)
Authored Mar 05, 2022 by Srihari Humbarwadi; committed by GitHub, Mar 05, 2022

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

Parents: 7479dbb8, 4c571a3c
Changes: 332 · Showing 20 changed files with 3875 additions and 0 deletions (+3875, -0)
official/vision/configs/image_classification_test.py       +49  -0
official/vision/configs/maskrcnn.py                         +523 -0
official/vision/configs/maskrcnn_test.py                    +47  -0
official/vision/configs/retinanet.py                        +438 -0
official/vision/configs/retinanet_test.py                   +46  -0
official/vision/configs/semantic_segmentation.py            +713 -0
official/vision/configs/semantic_segmentation_test.py       +46  -0
official/vision/configs/video_classification.py             +371 -0
official/vision/configs/video_classification_test.py        +45  -0
official/vision/data/__init__.py                            +14  -0
official/vision/data/create_coco_tf_record.py               +554 -0
official/vision/data/process_coco_few_shot.sh               +70  -0
official/vision/data/process_coco_few_shot_json_files.py    +144 -0
official/vision/data/process_coco_panoptic.sh               +40  -0
official/vision/data/tfrecord_lib.py                        +181 -0
official/vision/data/tfrecord_lib_test.py                   +93  -0
official/vision/dataloaders/__init__.py                     +14  -0
official/vision/dataloaders/classification_input.py         +273 -0
official/vision/dataloaders/decoder.py                      +35  -0
official/vision/dataloaders/input_reader.py                 +179 -0
official/vision/configs/image_classification_test.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for image_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import image_classification as exp_cfg


class ImageClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('resnet_imagenet',),
      ('resnet_rs_imagenet',),
      ('revnet_imagenet',),
      ('mobilenet_imagenet'),
  )
  def test_image_classification_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.ImageClassificationTask)
    self.assertIsInstance(config.task.model, exp_cfg.ImageClassificationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/configs/maskrcnn.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""R-CNN(-RS) configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones


# pylint: disable=missing-class-docstring
@dataclasses.dataclass
class Parser(hyperparams.Config):
  num_channels: int = 3
  match_threshold: float = 0.5
  unmatched_threshold: float = 0.5
  aug_rand_hflip: bool = False
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  skip_crowd_during_training: bool = True
  max_num_instances: int = 100
  rpn_match_threshold: float = 0.7
  rpn_unmatched_threshold: float = 0.3
  rpn_batch_size_per_im: int = 256
  rpn_fg_fraction: float = 0.5
  mask_crop_size: int = 112


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  input_path: str = ''
  global_batch_size: int = 0
  is_training: bool = False
  dtype: str = 'bfloat16'
  decoder: common.DataDecoder = common.DataDecoder()
  parser: Parser = Parser()
  shuffle_buffer_size: int = 10000
  file_type: str = 'tfrecord'
  drop_remainder: bool = True
  # Number of examples in the data set; it's used to create the annotation file.
  num_examples: int = -1


@dataclasses.dataclass
class Anchor(hyperparams.Config):
  num_scales: int = 1
  aspect_ratios: List[float] = dataclasses.field(
      default_factory=lambda: [0.5, 1.0, 2.0])
  anchor_size: float = 8.0


@dataclasses.dataclass
class RPNHead(hyperparams.Config):
  num_convs: int = 1
  num_filters: int = 256
  use_separable_conv: bool = False


@dataclasses.dataclass
class DetectionHead(hyperparams.Config):
  num_convs: int = 4
  num_filters: int = 256
  use_separable_conv: bool = False
  num_fcs: int = 1
  fc_dims: int = 1024
  class_agnostic_bbox_pred: bool = False  # Has to be True for Cascade RCNN.
  # If additional IoUs are passed in 'cascade_iou_thresholds'
  # then ensemble the class probabilities from all heads.
  cascade_class_ensemble: bool = False


@dataclasses.dataclass
class ROIGenerator(hyperparams.Config):
  pre_nms_top_k: int = 2000
  pre_nms_score_threshold: float = 0.0
  pre_nms_min_size_threshold: float = 0.0
  nms_iou_threshold: float = 0.7
  num_proposals: int = 1000
  test_pre_nms_top_k: int = 1000
  test_pre_nms_score_threshold: float = 0.0
  test_pre_nms_min_size_threshold: float = 0.0
  test_nms_iou_threshold: float = 0.7
  test_num_proposals: int = 1000
  use_batched_nms: bool = False


@dataclasses.dataclass
class ROISampler(hyperparams.Config):
  mix_gt_boxes: bool = True
  num_sampled_rois: int = 512
  foreground_fraction: float = 0.25
  foreground_iou_threshold: float = 0.5
  background_iou_high_threshold: float = 0.5
  background_iou_low_threshold: float = 0.0
  # IoU thresholds for additional FRCNN heads in Cascade mode.
  # `foreground_iou_threshold` is the first threshold.
  cascade_iou_thresholds: Optional[List[float]] = None


@dataclasses.dataclass
class ROIAligner(hyperparams.Config):
  crop_size: int = 7
  sample_offset: float = 0.5


@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
  apply_nms: bool = True
  pre_nms_top_k: int = 5000
  pre_nms_score_threshold: float = 0.05
  nms_iou_threshold: float = 0.5
  max_num_detections: int = 100
  nms_version: str = 'v2'  # `v2`, `v1`, `batched`
  use_cpu_nms: bool = False
  soft_nms_sigma: Optional[float] = None  # Only works when nms_version='v1'.


@dataclasses.dataclass
class MaskHead(hyperparams.Config):
  upsample_factor: int = 2
  num_convs: int = 4
  num_filters: int = 256
  use_separable_conv: bool = False
  class_agnostic: bool = False


@dataclasses.dataclass
class MaskSampler(hyperparams.Config):
  num_sampled_masks: int = 128


@dataclasses.dataclass
class MaskROIAligner(hyperparams.Config):
  crop_size: int = 14
  sample_offset: float = 0.5


@dataclasses.dataclass
class MaskRCNN(hyperparams.Config):
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 2
  max_level: int = 6
  anchor: Anchor = Anchor()
  include_mask: bool = True
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='fpn', fpn=decoders.FPN())
  rpn_head: RPNHead = RPNHead()
  detection_head: DetectionHead = DetectionHead()
  roi_generator: ROIGenerator = ROIGenerator()
  roi_sampler: ROISampler = ROISampler()
  roi_aligner: ROIAligner = ROIAligner()
  detection_generator: DetectionGenerator = DetectionGenerator()
  mask_head: Optional[MaskHead] = MaskHead()
  mask_sampler: Optional[MaskSampler] = MaskSampler()
  mask_roi_aligner: Optional[MaskROIAligner] = MaskROIAligner()
  norm_activation: common.NormActivation = common.NormActivation(
      norm_momentum=0.997, norm_epsilon=0.0001, use_sync_bn=True)


@dataclasses.dataclass
class Losses(hyperparams.Config):
  loss_weight: float = 1.0
  rpn_huber_loss_delta: float = 1. / 9.
  frcnn_huber_loss_delta: float = 1.
  l2_weight_decay: float = 0.0
  rpn_score_weight: float = 1.0
  rpn_box_weight: float = 1.0
  frcnn_class_weight: float = 1.0
  frcnn_box_weight: float = 1.0
  mask_weight: float = 1.0


@dataclasses.dataclass
class MaskRCNNTask(cfg.TaskConfig):
  model: MaskRCNN = MaskRCNN()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[str, List[str]] = 'all'  # all, backbone, and/or decoder
  annotation_file: Optional[str] = None
  per_category_metrics: bool = False
  # If set, we only use masks for the specified class IDs.
  allowed_mask_class_ids: Optional[List[int]] = None
  # If set, the COCO metrics will be computed.
  use_coco_metrics: bool = True
  # If set, the Waymo Open Dataset evaluator would be used.
  use_wod_metrics: bool = False


COCO_INPUT_PATH_BASE = 'coco'
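
A note on the `dataclasses.field(default_factory=...)` pattern used throughout these configs: Python dataclasses reject mutable defaults such as lists, so every list-valued field above supplies a factory instead. A minimal standalone sketch (the `Toy` class is illustrative only, not part of this commit):

import dataclasses
from typing import List

@dataclasses.dataclass
class Toy:
  # A bare `ratios: List[float] = [0.5, 1.0, 2.0]` raises
  # "ValueError: mutable default ... is not allowed"; the factory gives
  # each instance its own fresh list.
  ratios: List[float] = dataclasses.field(
      default_factory=lambda: [0.5, 1.0, 2.0])

a, b = Toy(), Toy()
a.ratios.append(4.0)
assert b.ratios == [0.5, 1.0, 2.0]  # instances do not share state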
@exp_factory.register_config_factory('fasterrcnn_resnetfpn_coco')
def fasterrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
  """COCO object detection with Faster R-CNN."""
  steps_per_epoch = 500
  coco_val_samples = 5000
  train_batch_size = 64
  eval_batch_size = 8

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=MaskRCNNTask(
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
          init_checkpoint_modules='backbone',
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=MaskRCNN(
              num_classes=91,
              input_size=[1024, 1024, 3],
              include_mask=False,
              mask_head=None,
              mask_sampler=None,
              mask_roi_aligner=None),
          losses=Losses(l2_weight_decay=0.00004),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=False)),
      trainer=cfg.TrainerConfig(
          train_steps=22500,
          validation_steps=coco_val_samples // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [15000, 20000],
                      'values': [0.12, 0.012, 0.0012],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 500,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('maskrcnn_resnetfpn_coco')
def maskrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
  """COCO object detection with Mask R-CNN."""
  steps_per_epoch = 500
  coco_val_samples = 5000
  train_batch_size = 64
  eval_batch_size = 8

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='bfloat16', enable_xla=True),
      task=MaskRCNNTask(
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
          init_checkpoint_modules='backbone',
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=MaskRCNN(
              num_classes=91, input_size=[1024, 1024, 3], include_mask=True),
          losses=Losses(l2_weight_decay=0.00004),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=False)),
      trainer=cfg.TrainerConfig(
          train_steps=22500,
          validation_steps=coco_val_samples // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [15000, 20000],
                      'values': [0.12, 0.012, 0.0012],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 500,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('maskrcnn_spinenet_coco')
def maskrcnn_spinenet_coco() -> cfg.ExperimentConfig:
  """COCO object detection with Mask R-CNN with SpineNet backbone."""
  steps_per_epoch = 463
  coco_val_samples = 5000
  train_batch_size = 256
  eval_batch_size = 8

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=MaskRCNNTask(
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=MaskRCNN(
              backbone=backbones.Backbone(
                  type='spinenet',
                  spinenet=backbones.SpineNet(
                      model_id='49',
                      min_level=3,
                      max_level=7,
                  )),
              decoder=decoders.Decoder(
                  type='identity', identity=decoders.Identity()),
              anchor=Anchor(anchor_size=3),
              norm_activation=common.NormActivation(use_sync_bn=True),
              num_classes=91,
              input_size=[640, 640, 3],
              min_level=3,
              max_level=7,
              include_mask=True),
          losses=Losses(l2_weight_decay=0.00004),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=False)),
      trainer=cfg.TrainerConfig(
          train_steps=steps_per_epoch * 350,
          validation_steps=coco_val_samples // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [
                          steps_per_epoch * 320, steps_per_epoch * 340
                      ],
                      'values': [0.32, 0.032, 0.0032],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.model.min_level == task.model.backbone.spinenet.min_level',
          'task.model.max_level == task.model.backbone.spinenet.max_level',
      ])
  return config
@exp_factory.register_config_factory('cascadercnn_spinenet_coco')
def cascadercnn_spinenet_coco() -> cfg.ExperimentConfig:
  """COCO object detection with Cascade RCNN-RS with SpineNet backbone."""
  steps_per_epoch = 463
  coco_val_samples = 5000
  train_batch_size = 256
  eval_batch_size = 8

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=MaskRCNNTask(
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=MaskRCNN(
              backbone=backbones.Backbone(
                  type='spinenet',
                  spinenet=backbones.SpineNet(
                      model_id='49',
                      min_level=3,
                      max_level=7,
                  )),
              decoder=decoders.Decoder(
                  type='identity', identity=decoders.Identity()),
              roi_sampler=ROISampler(cascade_iou_thresholds=[0.6, 0.7]),
              detection_head=DetectionHead(
                  class_agnostic_bbox_pred=True, cascade_class_ensemble=True),
              anchor=Anchor(anchor_size=3),
              norm_activation=common.NormActivation(
                  use_sync_bn=True, activation='swish'),
              num_classes=91,
              input_size=[640, 640, 3],
              min_level=3,
              max_level=7,
              include_mask=True),
          losses=Losses(l2_weight_decay=0.00004),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.5)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=False)),
      trainer=cfg.TrainerConfig(
          train_steps=steps_per_epoch * 500,
          validation_steps=coco_val_samples // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [
                          steps_per_epoch * 475, steps_per_epoch * 490
                      ],
                      'values': [0.32, 0.032, 0.0032],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.model.min_level == task.model.backbone.spinenet.min_level',
          'task.model.max_level == task.model.backbone.spinenet.max_level',
      ])
  return config
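
As a quick usage sketch, any experiment registered above can be fetched by name through `exp_factory` and selectively overridden before validation, mirroring what the test file below does; the batch-size override here is illustrative only:

from official import vision  # importing registers the config factories
from official.core import exp_factory

config = exp_factory.get_exp_config('maskrcnn_resnetfpn_coco')
config.task.train_data.global_batch_size = 32  # illustrative override
config.validate()  # re-checks the declared restrictions after the override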
official/vision/configs/maskrcnn_test.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for maskrcnn."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import maskrcnn as exp_cfg


class MaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('fasterrcnn_resnetfpn_coco',),
      ('maskrcnn_resnetfpn_coco',),
      ('maskrcnn_spinenet_coco',),
      ('cascadercnn_spinenet_coco',),
  )
  def test_maskrcnn_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.MaskRCNNTask)
    self.assertIsInstance(config.task.model, exp_cfg.MaskRCNN)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/configs/retinanet.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""RetinaNet configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones


# pylint: disable=missing-class-docstring
# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
  """A simple TF Example decoder config."""


# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoderLabelMap(common.TfExampleDecoderLabelMap):
  """TF Example decoder with label map config."""


# Keep for backward compatibility.
@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
  """Data decoder config."""


@dataclasses.dataclass
class Parser(hyperparams.Config):
  num_channels: int = 3
  match_threshold: float = 0.5
  unmatched_threshold: float = 0.5
  aug_rand_hflip: bool = False
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  skip_crowd_during_training: bool = True
  max_num_instances: int = 100
  # Can choose AutoAugment and RandAugment.
  aug_type: Optional[common.Augmentation] = None

  # Keep for backward compatibility. Not used.
  aug_policy: Optional[str] = None


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  input_path: str = ''
  global_batch_size: int = 0
  is_training: bool = False
  dtype: str = 'bfloat16'
  decoder: common.DataDecoder = common.DataDecoder()
  parser: Parser = Parser()
  shuffle_buffer_size: int = 10000
  file_type: str = 'tfrecord'


@dataclasses.dataclass
class Anchor(hyperparams.Config):
  num_scales: int = 3
  aspect_ratios: List[float] = dataclasses.field(
      default_factory=lambda: [0.5, 1.0, 2.0])
  anchor_size: float = 4.0


@dataclasses.dataclass
class Losses(hyperparams.Config):
  loss_weight: float = 1.0
  focal_loss_alpha: float = 0.25
  focal_loss_gamma: float = 1.5
  huber_loss_delta: float = 0.1
  box_loss_weight: int = 50
  l2_weight_decay: float = 0.0


@dataclasses.dataclass
class AttributeHead(hyperparams.Config):
  name: str = ''
  type: str = 'regression'
  size: int = 1


@dataclasses.dataclass
class RetinaNetHead(hyperparams.Config):
  num_convs: int = 4
  num_filters: int = 256
  use_separable_conv: bool = False
  attribute_heads: List[AttributeHead] = dataclasses.field(
      default_factory=list)


@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
  apply_nms: bool = True
  pre_nms_top_k: int = 5000
  pre_nms_score_threshold: float = 0.05
  nms_iou_threshold: float = 0.5
  max_num_detections: int = 100
  nms_version: str = 'v2'  # `v2`, `v1`, `batched`, or `tflite`.
  use_cpu_nms: bool = False
  soft_nms_sigma: Optional[float] = None  # Only works when nms_version='v1'.

  # When nms_version = `tflite`, values from tflite_post_processing need to be
  # specified. They are compatible with the input arguments used by TFLite
  # custom NMS op and override above parameters.
  tflite_post_processing: common.TFLitePostProcessingConfig = (
      common.TFLitePostProcessingConfig())
  max_detections: int = 200
  max_classes_per_detection: int = 5
  # Regular NMS runs in a multi-class fashion and is slow. Setting it to False
  # uses class-agnostic NMS, which is faster.
  use_regular_nms: bool = False
  nms_score_threshold: float = 0.1


@dataclasses.dataclass
class RetinaNet(hyperparams.Config):
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 7
  anchor: Anchor = Anchor()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='fpn', fpn=decoders.FPN())
  head: RetinaNetHead = RetinaNetHead()
  detection_generator: DetectionGenerator = DetectionGenerator()
  norm_activation: common.NormActivation = common.NormActivation()


@dataclasses.dataclass
class ExportConfig(hyperparams.Config):
  output_normalized_coordinates: bool = False
  cast_num_detections_to_float: bool = False
  cast_detection_classes_to_float: bool = False


@dataclasses.dataclass
class RetinaNetTask(cfg.TaskConfig):
  model: RetinaNet = RetinaNet()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  losses: Losses = Losses()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[str, List[str]] = 'all'  # all, backbone, and/or decoder
  annotation_file: Optional[str] = None
  per_category_metrics: bool = False
  export_config: ExportConfig = ExportConfig()
  # If set, the COCO metrics will be computed.
  use_coco_metrics: bool = True
  # If set, the Waymo Open Dataset evaluator would be used.
  use_wod_metrics: bool = False
@exp_factory.register_config_factory('retinanet')
def retinanet() -> cfg.ExperimentConfig:
  """RetinaNet general config."""
  return cfg.ExperimentConfig(
      task=RetinaNetTask(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])


COCO_INPUT_PATH_BASE = 'coco'
COCO_TRAIN_EXAMPLES = 118287
COCO_VAL_EXAMPLES = 5000
@exp_factory.register_config_factory('retinanet_resnetfpn_coco')
def retinanet_resnetfpn_coco() -> cfg.ExperimentConfig:
  """COCO object detection with RetinaNet."""
  train_batch_size = 256
  eval_batch_size = 8
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=RetinaNetTask(
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
          init_checkpoint_modules='backbone',
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=RetinaNet(
              num_classes=91,
              input_size=[640, 640, 3],
              norm_activation=common.NormActivation(use_sync_bn=False),
              min_level=3,
              max_level=7),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.2)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size)),
      trainer=cfg.TrainerConfig(
          train_steps=72 * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [
                          57 * steps_per_epoch, 67 * steps_per_epoch
                      ],
                      'values': [
                          0.32 * train_batch_size / 256.0,
                          0.032 * train_batch_size / 256.0,
                          0.0032 * train_batch_size / 256.0
                      ],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 500,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
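
The stepwise learning-rate values above follow the usual linear batch-size scaling rule: base rates of 0.32 / 0.032 / 0.0032 are defined for a batch of 256 and multiplied by `train_batch_size / 256.0`. A quick sanity check that just re-derives the config's own arithmetic:

train_batch_size = 256
values = [v * train_batch_size / 256.0 for v in (0.32, 0.032, 0.0032)]
assert values == [0.32, 0.032, 0.0032]  # halving the batch halves each rate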
@exp_factory.register_config_factory('retinanet_spinenet_coco')
def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
  """COCO object detection with RetinaNet using SpineNet backbone."""
  train_batch_size = 256
  eval_batch_size = 8
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  input_size = 640

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
      task=RetinaNetTask(
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=RetinaNet(
              backbone=backbones.Backbone(
                  type='spinenet',
                  spinenet=backbones.SpineNet(
                      model_id='49',
                      stochastic_depth_drop_rate=0.2,
                      min_level=3,
                      max_level=7)),
              decoder=decoders.Decoder(
                  type='identity', identity=decoders.Identity()),
              anchor=Anchor(anchor_size=3),
              norm_activation=common.NormActivation(
                  use_sync_bn=True, activation='swish'),
              num_classes=91,
              input_size=[input_size, input_size, 3],
              min_level=3,
              max_level=7),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size)),
      trainer=cfg.TrainerConfig(
          train_steps=500 * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [
                          475 * steps_per_epoch, 490 * steps_per_epoch
                      ],
                      'values': [
                          0.32 * train_batch_size / 256.0,
                          0.032 * train_batch_size / 256.0,
                          0.0032 * train_batch_size / 256.0
                      ],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.model.min_level == task.model.backbone.spinenet.min_level',
          'task.model.max_level == task.model.backbone.spinenet.max_level',
      ])
  return config
@exp_factory.register_config_factory('retinanet_mobile_coco')
def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
  """COCO object detection with mobile RetinaNet."""
  train_batch_size = 256
  eval_batch_size = 8
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  input_size = 384

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
      task=RetinaNetTask(
          annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=RetinaNet(
              backbone=backbones.Backbone(
                  type='spinenet_mobile',
                  spinenet_mobile=backbones.SpineNetMobile(
                      model_id='49',
                      stochastic_depth_drop_rate=0.2,
                      min_level=3,
                      max_level=7,
                      use_keras_upsampling_2d=False)),
              decoder=decoders.Decoder(
                  type='identity', identity=decoders.Identity()),
              head=RetinaNetHead(num_filters=48, use_separable_conv=True),
              anchor=Anchor(anchor_size=3),
              norm_activation=common.NormActivation(
                  use_sync_bn=True, activation='swish'),
              num_classes=91,
              input_size=[input_size, input_size, 3],
              min_level=3,
              max_level=7),
          losses=Losses(l2_weight_decay=3e-5),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size)),
      trainer=cfg.TrainerConfig(
          train_steps=600 * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [
                          575 * steps_per_epoch, 590 * steps_per_epoch
                      ],
                      'values': [
                          0.32 * train_batch_size / 256.0,
                          0.032 * train_batch_size / 256.0,
                          0.0032 * train_batch_size / 256.0
                      ],
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0.0067
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
      ])
  return config
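
The `attribute_heads` field on `RetinaNetHead` (defined earlier in this file) allows extra per-detection outputs alongside boxes and classes. A hedged sketch of wiring one in — the 'depth' attribute name is hypothetical, chosen only to show the shape of the config:

head = RetinaNetHead(
    num_filters=256,
    attribute_heads=[
        # One scalar regression output per detection; `name` is a free-form
        # label (hypothetical here), `size` is the output dimensionality.
        AttributeHead(name='depth', type='regression', size=1),
    ])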
official/vision/configs/retinanet_test.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for retinanet."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import retinanet as exp_cfg


class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('retinanet_resnetfpn_coco',),
      ('retinanet_spinenet_coco',),
      ('retinanet_mobile_coco',),
  )
  def test_retinanet_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.RetinaNetTask)
    self.assertIsInstance(config.task.model, exp_cfg.RetinaNet)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/configs/semantic_segmentation.py  0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Semantic segmentation configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  # If crop_size is specified, image will be resized first to
  # output_size, then crop of size crop_size will be cropped.
  crop_size: List[int] = dataclasses.field(default_factory=list)
  input_path: str = ''
  global_batch_size: int = 0
  is_training: bool = True
  dtype: str = 'float32'
  shuffle_buffer_size: int = 1000
  cycle_length: int = 10
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  preserve_aspect_ratio: bool = True
  aug_policy: Optional[str] = None
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  decoder: Optional[common.DataDecoder] = common.DataDecoder()


@dataclasses.dataclass
class SegmentationHead(hyperparams.Config):
  """Segmentation head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  use_depthwise_convolution: bool = False
  prediction_kernel_size: int = 1
  upsample_factor: int = 1
  feature_fusion: Optional[str] = None  # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
  # deeplabv3plus feature fusion params
  low_level: Union[int, str] = 2
  low_level_num_filters: int = 48
  # panoptic_fpn_fusion params
  decoder_min_level: Optional[Union[int, str]] = None
  decoder_max_level: Optional[Union[int, str]] = None


@dataclasses.dataclass
class MaskScoringHead(hyperparams.Config):
  """Mask Scoring head config."""
  num_convs: int = 4
  num_filters: int = 128
  fc_input_size: List[int] = dataclasses.field(default_factory=list)
  num_fcs: int = 2
  fc_dims: int = 1024


@dataclasses.dataclass
class SemanticSegmentationModel(hyperparams.Config):
  """Semantic segmentation model config."""
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  head: SegmentationHead = SegmentationHead()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='identity')
  mask_scoring_head: Optional[MaskScoringHead] = None
  norm_activation: common.NormActivation = common.NormActivation()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  loss_weight: float = 1.0
  label_smoothing: float = 0.0
  ignore_label: int = 255
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 0.0
  use_groundtruth_dimension: bool = True
  top_k_percent_pixels: float = 1.0


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  report_per_class_iou: bool = True
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class SemanticSegmentationTask(cfg.TaskConfig):
  """The model config."""
  model: SemanticSegmentationModel = SemanticSegmentationModel()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  losses: Losses = Losses()
  evaluation: Evaluation = Evaluation()
  train_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  eval_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[str, List[str]] = 'all'  # all, backbone, and/or decoder


@exp_factory.register_config_factory('semantic_segmentation')
def semantic_segmentation() -> cfg.ExperimentConfig:
  """Semantic segmentation general."""
  return cfg.ExperimentConfig(
      task=SemanticSegmentationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])


# PASCAL VOC 2012 Dataset
PASCAL_TRAIN_EXAMPLES = 10582
PASCAL_VAL_EXAMPLES = 1449
PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg'
@exp_factory.register_config_factory('seg_deeplabv3_pascal')
def seg_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [12, 24, 36]  # [6, 12, 18] if output_stride = 16
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      stem_type=stem_type)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              # TODO(arashwan): test changing size to 513 to match deeplab.
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
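
The segmentation factories derive the ASPP/head `level` from the backbone `output_stride` as log2(stride), since feature level n has stride 2**n. A minimal check of the same arithmetic (`np.math` in the code above is an alias for the standard `math` module in the NumPy versions this code targets):

import math

output_stride = 16
level = int(math.log2(output_stride))
assert level == 4  # level-4 features have stride 2**4 == 16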
@exp_factory.register_config_factory('seg_deeplabv3plus_pascal')
def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('seg_resnetfpn_pascal')
def seg_resnetfpn_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet-fpn."""
  train_batch_size = 256
  eval_batch_size = 32
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[512, 512, 3],
              min_level=3,
              max_level=7,
              backbone=backbones.Backbone(
                  type='resnet', resnet=backbones.ResNet(model_id=50)),
              decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()),
              head=SegmentationHead(level=3, num_convs=3),
              norm_activation=common.NormActivation(
                  activation='swish', use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.2,
              aug_scale_max=1.5),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
      ),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=450 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 450 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('mnv2_deeplabv3_pascal')
def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  level = int(np.math.log2(output_stride))
  pool_kernel_size = []
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # mobilenetv2
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules=['backbone', 'decoder']),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=30000,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007 * train_batch_size / 16,
                      'decay_steps': 30000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
# Cityscapes Dataset (Download and process the dataset yourself)
CITYSCAPES_TRAIN_EXAMPLES = 2975
CITYSCAPES_VAL_EXAMPLES = 500
CITYSCAPES_INPUT_PATH_BASE = 'cityscapes'


@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes')
def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=[512, 1024])),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=500 * steps_per_epoch,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 500 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes')
def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  pool_kernel_size = [512, 1024]
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # Coco pre-trained mobilenetv2 checkpoint
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=100000,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 100000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@
exp_factory
.
register_config_factory
(
'mnv2_deeplabv3plus_cityscapes'
)
def
mnv2_deeplabv3plus_cityscapes
()
->
cfg
.
ExperimentConfig
:
"""Image segmentation on cityscapes with mobilenetv2 deeplabv3plus."""
config
=
mnv2_deeplabv3_cityscapes
()
config
.
task
.
model
.
head
=
SegmentationHead
(
level
=
4
,
num_convs
=
2
,
feature_fusion
=
'deeplabv3plus'
,
use_depthwise_convolution
=
True
,
low_level
=
'2/depthwise'
,
low_level_num_filters
=
48
)
config
.
task
.
model
.
backbone
.
mobilenet
.
output_intermediate_endpoints
=
True
return
config
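
# A minimal usage sketch (shown commented out so the module's import-time
# behavior is unchanged): any experiment registered above can be fetched by
# its factory name. The batch-size override below is an arbitrary
# illustration, not a recommended value.
#
#   from official.core import exp_factory
#
#   config = exp_factory.get_exp_config('mnv2_deeplabv3plus_cityscapes')
#   config.task.train_data.global_batch_size = 8  # illustrative override
#   config.validate()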
official/vision/configs/semantic_segmentation_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import semantic_segmentation as exp_cfg


class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('seg_deeplabv3_pascal',),
      ('seg_deeplabv3plus_pascal',),
  )
  def test_semantic_segmentation_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask)
    self.assertIsInstance(config.task.model,
                          exp_cfg.SemanticSegmentationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/configs/video_classification.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building datasets."""
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  num_classes: int = -1
  num_examples: int = -1
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: str = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  is_multilabel: bool = False
  output_audio: bool = False
  audio_feature: str = ''
  audio_feature_shape: Tuple[int, ...] = (-1,)
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[str] = None  # 'autoaug', 'randaug', or None
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'
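
# A minimal sketch (commented out) of instantiating the dataclass above
# directly; the values and the input path are illustrative placeholders, not
# a real dataset.
#
#   example_data = DataConfig(
#       name='kinetics400',
#       num_classes=400,
#       num_examples=215570,
#       is_training=True,
#       global_batch_size=64,
#       feature_shape=(32, 224, 224, 3),
#       input_path='/tmp/kinetics400/train*')  # hypothetical path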
def kinetics400(is_training):
  """Generates Kinetics 400 dataset configs."""
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=215570 if is_training else 17706,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics600(is_training):
  """Generates Kinetics 600 dataset configs."""
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=366016 if is_training else 27780,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700(is_training):
  """Generates Kinetics 700 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700_2020(is_training):
  """Generates Kinetics 700-2020 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The model config."""
  model_type: str = 'video_classification'
  backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D(
      type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50())
  norm_activation: common.NormActivation = common.NormActivation(
      use_sync_bn=False)
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None


@dataclasses.dataclass
class Losses(hyperparams.Config):
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0


@dataclasses.dataclass
class Metrics(hyperparams.Config):
  use_per_class_recall: bool = False


@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The task config."""
  model: VideoClassificationModel = VideoClassificationModel()
  train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  metrics: Metrics = Metrics()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None


def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Adds and configures a trainer in the experiment config."""
  if experiment.task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(
        experiment.task.train_data))
  if experiment.task.validation_data.num_examples <= 0:
    raise ValueError('Wrong validation dataset size {!r}'.format(
        experiment.task.validation_data))
  experiment.task.train_data.global_batch_size = train_batch_size
  experiment.task.validation_data.global_batch_size = eval_batch_size
  steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=train_epochs * steps_per_epoch,
      validation_steps=experiment.task.validation_data.num_examples //
      eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimization.OptimizationConfig({
          'optimizer': {
              'type': 'sgd',
              'sgd': {
                  'momentum': 0.9,
                  'nesterov': True,
              }
          },
          'learning_rate': {
              'type': 'cosine',
              'cosine': {
                  'initial_learning_rate': learning_rate,
                  'decay_steps': train_epochs * steps_per_epoch,
              }
          },
          'warmup': {
              'type': 'linear',
              'linear': {
                  'warmup_steps': warmup_epochs * steps_per_epoch,
                  'warmup_learning_rate': 0
              }
          }
      }))
  return experiment
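
# A minimal sketch (commented out) of wiring `add_trainer` into an experiment;
# the batch sizes are arbitrary assumptions. Note that `add_trainer` both
# mutates and returns the config, and derives steps_per_epoch as
# num_examples // train_batch_size.
#
#   experiment = cfg.ExperimentConfig(task=VideoClassificationTask(
#       train_data=kinetics400(is_training=True),
#       validation_data=kinetics400(is_training=False)))
#   experiment = add_trainer(
#       experiment, train_batch_size=256, eval_batch_size=32)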
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general."""
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])


@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet."""
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config


@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 400 with resnet."""
  train_dataset = kinetics400(is_training=True)
  validation_dataset = kinetics400(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 600 with resnet."""
  train_dataset = kinetics600(is_training=True)
  validation_dataset = kinetics600(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 with resnet."""
  train_dataset = kinetics700(is_training=True)
  validation_dataset = kinetics700(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 2020 with resnet."""
  train_dataset = kinetics700_2020(is_training=True)
  validation_dataset = kinetics700_2020(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
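
# A minimal usage sketch (commented out): once this module is imported, each
# factory above is reachable by its registered name, e.g.:
#
#   from official.core import exp_factory
#
#   config = exp_factory.get_exp_config('video_classification_kinetics600')
#   assert config.task.train_data.num_classes == 600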
official/vision/configs/video_classification_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import video_classification as exp_cfg


class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('video_classification',),
      ('video_classification_kinetics600',),
  )
  def test_video_classification_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask)
    self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/data/__init__.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/data/create_coco_tf_record.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Convert raw COCO dataset to TFRecord format.

This script follows the label map decoder format and supports detection
boxes, instance masks and captions.

Example usage:
    python create_coco_tf_record.py --logtostderr \
      --image_dir="${TRAIN_IMAGE_DIR}" \
      --image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
      --object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
      --caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
      --output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
      --num_shards=100
"""
import collections
import json
import logging
import os

from absl import app  # pylint:disable=unused-import
from absl import flags
import numpy as np

from pycocotools import mask
import tensorflow as tf

import multiprocessing as mp
from official.vision.data import tfrecord_lib


flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentation masks '
    '(PNG encoded) in the result. default: False.')
flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
    'image_info_file', '', 'File containing image information. '
    'Tf Examples in the output files correspond to the image '
    'info entries in this file. If this file is not provided '
    'object_annotations_file is used if present. Otherwise, '
    'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
    'object_annotations_file', '', 'File containing object '
    'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
                    'captions.')
flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic '
                    'annotations.')
flags.DEFINE_string('panoptic_masks_dir', '',
                    'Directory containing panoptic masks annotations.')
flags.DEFINE_boolean(
    'include_panoptic_masks', False, 'Whether to include category and '
    'instance masks in the result. These are required to run the PQ evaluator. '
    'default: False.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')

FLAGS = flags.FLAGS

logger = tf.get_logger()
logger.setLevel(logging.INFO)

_VOID_LABEL = 0
_VOID_INSTANCE_ID = 0
_THING_CLASS_ID = 1
_STUFF_CLASSES_OFFSET = 90


def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd):
  """Encodes a COCO mask segmentation as a PNG string."""
  run_len_encoding = mask.frPyObjects(segmentation, height, width)
  binary_mask = mask.decode(run_len_encoding)
  if not is_crowd:
    binary_mask = np.amax(binary_mask, axis=2)

  return tfrecord_lib.encode_mask_as_png(binary_mask)
def generate_coco_panoptics_masks(segments_info, mask_path,
                                  include_panoptic_masks,
                                  is_category_thing):
  """Creates masks for the panoptic segmentation task.

  Args:
    segments_info: a list of dicts, where each dict has keys: [u'id',
      u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
      each segment in the panoptic mask.
    mask_path: path to the panoptic mask.
    include_panoptic_masks: bool, when set to True, category and instance
      masks are included in the outputs. Set this to True when using
      the Panoptic Quality evaluator.
    is_category_thing: a dict with category ids as keys and 0/1 as values to
      represent "stuff" and "things" classes respectively.

  Returns:
    A dict with keys: [u'semantic_segmentation_mask', u'category_mask',
    u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
    only if `include_panoptic_masks` is set to True.
  """
  rgb_mask = tfrecord_lib.read_image(mask_path)
  r, g, b = np.split(rgb_mask, 3, axis=-1)

  # Decode the RGB-encoded panoptic mask to get segment ids;
  # refer to https://cocodataset.org/#format-data.
  segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze()

  semantic_segmentation_mask = np.ones_like(
      segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
  if include_panoptic_masks:
    category_mask = np.ones_like(
        segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
    instance_mask = np.ones_like(
        segments_encoded_mask, dtype=np.uint8) * _VOID_INSTANCE_ID

  for idx, segment in enumerate(segments_info):
    segment_id = segment['id']
    category_id = segment['category_id']

    if is_category_thing[category_id]:
      encoded_category_id = _THING_CLASS_ID
      instance_id = idx + 1
    else:
      encoded_category_id = category_id - _STUFF_CLASSES_OFFSET
      instance_id = _VOID_INSTANCE_ID

    segment_mask = (segments_encoded_mask == segment_id)
    semantic_segmentation_mask[segment_mask] = encoded_category_id

    if include_panoptic_masks:
      category_mask[segment_mask] = category_id
      instance_mask[segment_mask] = instance_id

  outputs = {
      'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png(
          semantic_segmentation_mask)
  }

  if include_panoptic_masks:
    outputs.update({
        'category_mask': tfrecord_lib.encode_mask_as_png(category_mask),
        'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask)
    })
  return outputs
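
# A worked example (commented out) of the RGB decoding above: the COCO
# panoptic format packs a segment id into a pixel as
# id = R + 256 * G + 256**2 * B, so a pixel of (r, g, b) = (21, 1, 0)
# belongs to segment 21 + 256 * 1 + 0 = 277.
#
#   rgb = np.array([[[21, 1, 0]]])
#   r, g, b = np.split(rgb, 3, axis=-1)
#   assert (r + g * 256 + b * (256**2)).squeeze() == 277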
def coco_annotations_to_lists(bbox_annotations, id_to_name_map,
                              image_height, image_width, include_masks):
  """Converts COCO annotations to feature lists."""
  data = dict((k, list()) for k in
              ['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd',
               'category_id', 'category_names', 'area'])
  if include_masks:
    data['encoded_mask_png'] = []

  num_annotations_skipped = 0

  for object_annotations in bbox_annotations:
    (x, y, width, height) = tuple(object_annotations['bbox'])

    if width <= 0 or height <= 0:
      num_annotations_skipped += 1
      continue
    if x + width > image_width or y + height > image_height:
      num_annotations_skipped += 1
      continue
    data['xmin'].append(float(x) / image_width)
    data['xmax'].append(float(x + width) / image_width)
    data['ymin'].append(float(y) / image_height)
    data['ymax'].append(float(y + height) / image_height)
    data['is_crowd'].append(object_annotations['iscrowd'])
    category_id = int(object_annotations['category_id'])
    data['category_id'].append(category_id)
    data['category_names'].append(id_to_name_map[category_id].encode('utf8'))
    data['area'].append(object_annotations['area'])

    if include_masks:
      data['encoded_mask_png'].append(
          coco_segmentation_to_mask_png(object_annotations['segmentation'],
                                        image_height, image_width,
                                        object_annotations['iscrowd'])
      )

  return data, num_annotations_skipped
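
# A worked example of the conversion above: in a 200x160 (width x height)
# image, the COCO box [x, y, width, height] = [20, 40, 60, 80] becomes
# xmin = 20/200 = 0.1, xmax = 80/200 = 0.4, ymin = 40/160 = 0.25 and
# ymax = 120/160 = 0.75. Had the box extended past the image border (e.g.
# y + height = 120 in a 100-pixel-tall image), the bounds check above would
# have skipped it instead.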
def bbox_annotations_to_feature_dict(
    bbox_annotations, image_height, image_width, id_to_name_map,
    include_masks):
  """Converts COCO annotations to an encoded feature dict."""

  data, num_skipped = coco_annotations_to_lists(
      bbox_annotations, id_to_name_map, image_height, image_width,
      include_masks)
  feature_dict = {
      'image/object/bbox/xmin': tfrecord_lib.convert_to_feature(data['xmin']),
      'image/object/bbox/xmax': tfrecord_lib.convert_to_feature(data['xmax']),
      'image/object/bbox/ymin': tfrecord_lib.convert_to_feature(data['ymin']),
      'image/object/bbox/ymax': tfrecord_lib.convert_to_feature(data['ymax']),
      'image/object/class/text':
          tfrecord_lib.convert_to_feature(data['category_names']),
      'image/object/class/label':
          tfrecord_lib.convert_to_feature(data['category_id']),
      'image/object/is_crowd':
          tfrecord_lib.convert_to_feature(data['is_crowd']),
      'image/object/area': tfrecord_lib.convert_to_feature(data['area']),
  }
  if include_masks:
    feature_dict['image/object/mask'] = (
        tfrecord_lib.convert_to_feature(data['encoded_mask_png']))

  return feature_dict, num_skipped


def encode_caption_annotations(caption_annotations):
  captions = []
  for caption_annotation in caption_annotations:
    captions.append(caption_annotation['caption'].encode('utf8'))

  return captions


def create_tf_example(image,
                      image_dirs,
                      panoptic_masks_dir=None,
                      bbox_annotations=None,
                      id_to_name_map=None,
                      caption_annotations=None,
                      panoptic_annotation=None,
                      is_category_thing=None,
                      include_panoptic_masks=False,
                      include_masks=False):
  """Converts image and annotations to a tf.Example proto.

  Args:
    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
      u'width', u'date_captured', u'flickr_url', u'id']
    image_dirs: list of directories containing the image files.
    panoptic_masks_dir: `str` of the panoptic masks directory.
    bbox_annotations:
      list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
      u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
      coordinates in the official COCO dataset are given as [x, y, width,
      height] tuples using absolute coordinates where x, y represent the
      top-left (0-indexed) corner. This function converts to the format
      expected by the Tensorflow Object Detection API (which is
      [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
      size).
    id_to_name_map: a dict mapping category IDs to string names.
    caption_annotations:
      list of dict with keys: [u'id', u'image_id', u'str'].
    panoptic_annotation: dict with keys: [u'image_id', u'file_name',
      u'segments_info']. Where the value for segments_info is a list of dicts,
      with each dict containing information for a single segment in the mask.
    is_category_thing: `bool`, whether it is a category thing.
    include_panoptic_masks: `bool`, whether to include panoptic masks.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. default: False.

  Returns:
    example: The converted tf.Example
    num_annotations_skipped: Number of (invalid) annotations that were ignored.

  Raises:
    ValueError: if the image pointed to by data['filename'] is not a valid
      JPEG, does not exist, or is not unique across image directories.
  """
  image_height = image['height']
  image_width = image['width']
  filename = image['file_name']
  image_id = image['id']

  if len(image_dirs) > 1:
    full_paths = [os.path.join(image_dir, filename)
                  for image_dir in image_dirs]
    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
    if not full_existing_paths:
      raise ValueError(
          '{} does not exist across image directories.'.format(filename))
    if len(full_existing_paths) > 1:
      raise ValueError(
          '{} is not unique across image directories'.format(filename))
    full_path, = full_existing_paths
  # If there is only one image directory, it's not worth checking for
  # existence, since trying to open the file will raise an informative error
  # message if it does not exist.
  else:
    image_dir, = image_dirs
    full_path = os.path.join(image_dir, filename)

  with tf.io.gfile.GFile(full_path, 'rb') as fid:
    encoded_jpg = fid.read()

  feature_dict = tfrecord_lib.image_info_to_feature_dict(
      image_height, image_width, filename, image_id, encoded_jpg, 'jpg')

  num_annotations_skipped = 0
  if bbox_annotations:
    box_feature_dict, num_skipped = bbox_annotations_to_feature_dict(
        bbox_annotations, image_height, image_width, id_to_name_map,
        include_masks)
    num_annotations_skipped += num_skipped
    feature_dict.update(box_feature_dict)

  if caption_annotations:
    encoded_captions = encode_caption_annotations(caption_annotations)
    feature_dict.update(
        {'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)})

  if panoptic_annotation:
    segments_info = panoptic_annotation['segments_info']
    panoptic_mask_filename = os.path.join(
        panoptic_masks_dir,
        panoptic_annotation['file_name'])
    encoded_panoptic_masks = generate_coco_panoptics_masks(
        segments_info, panoptic_mask_filename, include_panoptic_masks,
        is_category_thing)
    feature_dict.update(
        {'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature(
            encoded_panoptic_masks['semantic_segmentation_mask'])})

    if include_panoptic_masks:
      feature_dict.update({
          'image/panoptic/category_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['category_mask']),
          'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['instance_mask'])
      })

  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
  return example, num_annotations_skipped


def _load_object_annotations(object_annotations_file):
  """Loads object annotation JSON file."""
  with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
    obj_annotations = json.load(fid)

  images = obj_annotations['images']
  id_to_name_map = dict((element['id'], element['name']) for element in
                        obj_annotations['categories'])

  img_to_obj_annotation = collections.defaultdict(list)
  logging.info('Building bounding box index.')
  for annotation in obj_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_obj_annotation[image_id].append(annotation)

  missing_annotation_count = 0
  for image in images:
    image_id = image['id']
    if image_id not in img_to_obj_annotation:
      missing_annotation_count += 1

  logging.info('%d images are missing bboxes.', missing_annotation_count)

  return img_to_obj_annotation, id_to_name_map


def _load_caption_annotations(caption_annotations_file):
  """Loads caption annotation JSON file."""
  with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
    caption_annotations = json.load(fid)

  img_to_caption_annotation = collections.defaultdict(list)
  logging.info('Building caption index.')
  for annotation in caption_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_caption_annotation[image_id].append(annotation)

  missing_annotation_count = 0
  images = caption_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_caption_annotation:
      missing_annotation_count += 1

  logging.info('%d images are missing captions.', missing_annotation_count)

  return img_to_caption_annotation


def _load_panoptic_annotations(panoptic_annotations_file):
  """Loads panoptic annotation from file."""
  with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
    panoptic_annotations = json.load(fid)

  img_to_panoptic_annotation = dict()
  logging.info('Building panoptic index.')
  for annotation in panoptic_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_panoptic_annotation[image_id] = annotation

  is_category_thing = dict()
  for category_info in panoptic_annotations['categories']:
    is_category_thing[category_info['id']] = category_info['isthing'] == 1

  missing_annotation_count = 0
  images = panoptic_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_panoptic_annotation:
      missing_annotation_count += 1
  logging.info(
      '%d images are missing panoptic annotations.', missing_annotation_count)

  return img_to_panoptic_annotation, is_category_thing


def _load_images_info(images_info_file):
  with tf.io.gfile.GFile(images_info_file, 'r') as fid:
    info_dict = json.load(fid)
  return info_dict['images']


def generate_annotations(images, image_dirs,
                         panoptic_masks_dir=None,
                         img_to_obj_annotation=None,
                         img_to_caption_annotation=None,
                         img_to_panoptic_annotation=None,
                         is_category_thing=None,
                         id_to_name_map=None,
                         include_panoptic_masks=False,
                         include_masks=False):
  """Generator for COCO annotations."""
  for image in images:
    object_annotation = (img_to_obj_annotation.get(image['id'], None) if
                         img_to_obj_annotation else None)

    caption_annotation = (img_to_caption_annotation.get(image['id'], None) if
                          img_to_caption_annotation else None)
    panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None) if
                           img_to_panoptic_annotation else None)

    yield (image, image_dirs, panoptic_masks_dir, object_annotation,
           id_to_name_map, caption_annotation, panoptic_annotation,
           is_category_thing, include_panoptic_masks, include_masks)


def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dirs,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            panoptic_masks_dir=None,
                                            panoptic_annotations_file=None,
                                            include_panoptic_masks=False,
                                            include_masks=False):
  """Loads COCO annotation json files and converts to tf.Record format.

  Args:
    images_info_file: JSON file containing image info. The number of
      tf.Examples in the output tf Record files is exactly equal to the number
      of image info entries in this file. This can be any of train/val/test
      annotation json files Eg. 'image_info_test-dev2017.json',
      'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dirs: List of directories containing the image files.
    output_path: Path to output tf.Record file.
    num_shards: Number of output files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    panoptic_masks_dir: Directory containing panoptic masks.
    panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask'
      and 'instance_mask', which is required by the panoptic quality evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. default: False.
  """

  logging.info('writing to output path: %s', output_path)

  images = _load_images_info(images_info_file)

  img_to_obj_annotation = None
  img_to_caption_annotation = None
  id_to_name_map = None
  img_to_panoptic_annotation = None
  is_category_thing = None
  if object_annotations_file:
    img_to_obj_annotation, id_to_name_map = (
        _load_object_annotations(object_annotations_file))
  if caption_annotations_file:
    img_to_caption_annotation = (
        _load_caption_annotations(caption_annotations_file))
  if panoptic_annotations_file:
    img_to_panoptic_annotation, is_category_thing = (
        _load_panoptic_annotations(panoptic_annotations_file))

  coco_annotations_iter = generate_annotations(
      images=images,
      image_dirs=image_dirs,
      panoptic_masks_dir=panoptic_masks_dir,
      img_to_obj_annotation=img_to_obj_annotation,
      img_to_caption_annotation=img_to_caption_annotation,
      img_to_panoptic_annotation=img_to_panoptic_annotation,
      is_category_thing=is_category_thing,
      id_to_name_map=id_to_name_map,
      include_panoptic_masks=include_panoptic_masks,
      include_masks=include_masks)

  num_skipped = tfrecord_lib.write_tf_record_dataset(
      output_path, coco_annotations_iter, create_tf_example, num_shards)

  logging.info('Finished writing, skipped %d annotations.', num_skipped)


def main(_):
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')
  if FLAGS.image_info_file:
    images_info_file = FLAGS.image_info_file
  elif FLAGS.object_annotations_file:
    images_info_file = FLAGS.object_annotations_file
  else:
    images_info_file = FLAGS.caption_annotations_file

  directory = os.path.dirname(FLAGS.output_file_prefix)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.panoptic_masks_dir,
                                          FLAGS.panoptic_annotations_file,
                                          FLAGS.include_panoptic_masks,
                                          FLAGS.include_masks)


if __name__ == '__main__':
  app.run(main)
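
# A read-back sketch (commented out) for the records written by this script;
# the glob pattern is a hypothetical placeholder and only two of the feature
# keys written by `bbox_annotations_to_feature_dict` are parsed here.
#
#   dataset = tf.data.TFRecordDataset(
#       tf.io.gfile.glob('/tmp/train-*.tfrecord'))  # hypothetical prefix
#   features = {
#       'image/encoded': tf.io.FixedLenFeature([], tf.string),
#       'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
#   }
#   for record in dataset.take(1):
#     example = tf.io.parse_single_example(record, features)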
official/vision/data/process_coco_few_shot.sh
0 → 100644
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"

while getopts ":i:o:" o; do
  case "${o}" in
    o)
      output_dir=${OPTARG}
      ;;
    i)
      base_image_dir=${OPTARG}
      ;;
    *)
      echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1
      ;;
  esac
done

cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" \
    -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

for seed in {0..9}; do
  for shots in 1 3 5 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir="${base_image_dir}/train2014" \
        --image_dir="${base_image_dir}/val2014" \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/trainvalno5k_base" \
    --num_shards=200

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k_base" \
    --num_shards=10

rm -rf "${tmp_dir}"
official/vision/data/process_coco_few_shot_json_files.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import
collections
import
itertools
import
json
import
logging
import
os
from
absl
import
app
from
absl
import
flags
import
tensorflow
as
tf
logger
=
tf
.
get_logger
()
logger
.
setLevel
(
logging
.
INFO
)
flags
.
DEFINE_string
(
'workdir'
,
None
,
'Working directory.'
)
FLAGS
=
flags
.
FLAGS
CATEGORIES
=
[
'airplane'
,
'apple'
,
'backpack'
,
'banana'
,
'baseball bat'
,
'baseball glove'
,
'bear'
,
'bed'
,
'bench'
,
'bicycle'
,
'bird'
,
'boat'
,
'book'
,
'bottle'
,
'bowl'
,
'broccoli'
,
'bus'
,
'cake'
,
'car'
,
'carrot'
,
'cat'
,
'cell phone'
,
'chair'
,
'clock'
,
'couch'
,
'cow'
,
'cup'
,
'dining table'
,
'dog'
,
'donut'
,
'elephant'
,
'fire hydrant'
,
'fork'
,
'frisbee'
,
'giraffe'
,
'hair drier'
,
'handbag'
,
'horse'
,
'hot dog'
,
'keyboard'
,
'kite'
,
'knife'
,
'laptop'
,
'microwave'
,
'motorcycle'
,
'mouse'
,
'orange'
,
'oven'
,
'parking meter'
,
'person'
,
'pizza'
,
'potted plant'
,
'refrigerator'
,
'remote'
,
'sandwich'
,
'scissors'
,
'sheep'
,
'sink'
,
'skateboard'
,
'skis'
,
'snowboard'
,
'spoon'
,
'sports ball'
,
'stop sign'
,
'suitcase'
,
'surfboard'
,
'teddy bear'
,
'tennis racket'
,
'tie'
,
'toaster'
,
'toilet'
,
'toothbrush'
,
'traffic light'
,
'train'
,
'truck'
,
'tv'
,
'umbrella'
,
'vase'
,
'wine glass'
,
'zebra'
]
SEEDS
=
list
(
range
(
10
))
SHOTS
=
[
1
,
3
,
5
,
10
,
30
]
FILE_SUFFIXES
=
collections
.
defaultdict
(
list
)
for
_seed
,
_shots
in
itertools
.
product
(
SEEDS
,
SHOTS
):
for
_category
in
CATEGORIES
:
FILE_SUFFIXES
[(
_seed
,
_shots
)].
append
(
'{}full_box_{}shot_{}_trainval.json'
.
format
(
# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
# datasplit/
# trainvalno5k.json
# 5k.json
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
# seed{1-9}/
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# This means that the JSON files for seed0 are located in the root
# directory rather than in a `seed?/` subdirectory, hence the
# conditional expression below.
''
if
_seed
==
0
else
'seed{}/'
.
format
(
_seed
),
_shots
,
_category
))
# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS
=
[
8
,
10
,
11
,
13
,
14
,
15
,
22
,
23
,
24
,
25
,
27
,
28
,
31
,
32
,
33
,
34
,
35
,
36
,
37
,
38
,
39
,
40
,
41
,
42
,
43
,
46
,
47
,
48
,
49
,
50
,
51
,
52
,
53
,
54
,
55
,
56
,
57
,
58
,
59
,
60
,
61
,
65
,
70
,
73
,
74
,
75
,
76
,
77
,
78
,
79
,
80
,
81
,
82
,
84
,
85
,
86
,
87
,
88
,
89
,
90
]
def
main
(
unused_argv
):
workdir
=
FLAGS
.
workdir
# Filter novel class annotations from the training and validation sets.
for
name
in
(
'trainvalno5k'
,
'5k'
):
file_path
=
os
.
path
.
join
(
workdir
,
'datasplit'
,
'{}.json'
.
format
(
name
))
with
tf
.
io
.
gfile
.
GFile
(
file_path
,
'r'
)
as
f
:
json_dict
=
json
.
load
(
f
)
json_dict
[
'annotations'
]
=
[
a
for
a
in
json_dict
[
'annotations'
]
if
a
[
'category_id'
]
in
BASE_CLASS_IDS
]
output_path
=
os
.
path
.
join
(
workdir
,
'datasplit'
,
'{}_base.json'
.
format
(
name
))
with
tf
.
io
.
gfile
.
GFile
(
output_path
,
'w'
)
as
f
:
json
.
dump
(
json_dict
,
f
)
for
seed
,
shots
in
itertools
.
product
(
SEEDS
,
SHOTS
):
# Retrieve all examples for a given seed and shots setting.
file_paths
=
[
os
.
path
.
join
(
workdir
,
suffix
)
for
suffix
in
FILE_SUFFIXES
[(
seed
,
shots
)]]
json_dicts
=
[]
for
file_path
in
file_paths
:
with
tf
.
io
.
gfile
.
GFile
(
file_path
,
'r'
)
as
f
:
json_dicts
.
append
(
json
.
load
(
f
))
# Make sure that all JSON files for a given seed and shots setting have the
# same metadata. We count on this to fuse them later on.
metadata_dicts
=
[{
'info'
:
d
[
'info'
],
'licenses'
:
d
[
'licenses'
],
'categories'
:
d
[
'categories'
]}
for
d
in
json_dicts
]
if
not
all
(
d
==
metadata_dicts
[
0
]
for
d
in
metadata_dicts
[
1
:]):
raise
RuntimeError
(
'JSON files for {} shots (seed {}) '
.
format
(
shots
,
seed
)
+
'have different info, licences, or categories fields'
)
# Retrieve images across all JSON files.
images
=
sum
((
d
[
'images'
]
for
d
in
json_dicts
),
[])
# Remove duplicate image entries.
images
=
list
({
image
[
'id'
]:
image
for
image
in
images
}.
values
())
output_dict
=
{
'info'
:
json_dicts
[
0
][
'info'
],
'licenses'
:
json_dicts
[
0
][
'licenses'
],
'categories'
:
json_dicts
[
0
][
'categories'
],
'images'
:
images
,
'annotations'
:
sum
((
d
[
'annotations'
]
for
d
in
json_dicts
),
[])
}
output_path
=
os
.
path
.
join
(
workdir
,
'{}shot_seed{}.json'
.
format
(
shots
,
seed
))
with
tf
.
io
.
gfile
.
GFile
(
output_path
,
'w'
)
as
f
:
json
.
dump
(
output_dict
,
f
)
logger
.
info
(
'Processed %d shots (seed %d) and saved to %s'
,
shots
,
seed
,
output_path
)
if
__name__
==
'__main__'
:
flags
.
mark_flag_as_required
(
'workdir'
)
app
.
run
(
main
)
official/vision/data/process_coco_panoptic.sh
0 → 100644
#!/bin/bash
sudo apt update
sudo apt install unzip aria2 -y

DATA_DIR=$1

aria2c -j 8 -Z \
  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
  http://images.cocodataset.org/zips/train2017.zip \
  http://images.cocodataset.org/zips/val2017.zip \
  --dir=$DATA_DIR;

unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;

unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/val2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/val" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
  --num_shards=8 \
  --include_masks \
  --include_panoptic_masks

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/train2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/train" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
  --num_shards=32 \
  --include_masks \
  --include_panoptic_masks
official/vision/data/tfrecord_lib.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools

from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf

import multiprocessing as mp


def convert_to_feature(value, value_type=None):
  """Converts the given python object to a tf.train.Feature.

  Args:
    value: int, float, bytes or a list of them.
    value_type: optional, if specified, forces the feature to be of the given
      type. Otherwise, type is inferred automatically. Can be one of
      ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list']

  Returns:
    feature: A tf.train.Feature object.
  """

  if value_type is None:

    element = value[0] if isinstance(value, list) else value

    if isinstance(element, bytes):
      value_type = 'bytes'

    elif isinstance(element, (int, np.integer)):
      value_type = 'int64'

    elif isinstance(element, (float, np.floating)):
      value_type = 'float'

    else:
      raise ValueError('Cannot convert type {} to feature'.format(
          type(element)))

    if isinstance(value, list):
      value_type = value_type + '_list'

  if value_type == 'int64':
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

  elif value_type == 'int64_list':
    value = np.asarray(value).astype(np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

  elif value_type == 'float':
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

  elif value_type == 'float_list':
    value = np.asarray(value).astype(np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

  elif value_type == 'bytes':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

  elif value_type == 'bytes_list':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

  else:
    raise ValueError('Unknown value_type parameter - {}'.format(value_type))
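
# Examples (commented out) of the type inference above:
#
#   convert_to_feature(7)             # -> int64 feature
#   convert_to_feature(0.5)           # -> float feature
#   convert_to_feature([b'a', b'b'])  # -> bytes_list feature, two values
#   convert_to_feature(7.0, value_type='float')  # explicitly typed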
def image_info_to_feature_dict(height, width, filename, image_id,
                               encoded_str, encoded_format):
  """Convert image information to a dict of features."""

  key = hashlib.sha256(encoded_str).hexdigest()

  return {
      'image/height': convert_to_feature(height),
      'image/width': convert_to_feature(width),
      'image/filename': convert_to_feature(filename.encode('utf8')),
      'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
      'image/key/sha256': convert_to_feature(key.encode('utf8')),
      'image/encoded': convert_to_feature(encoded_str),
      'image/format': convert_to_feature(encoded_format.encode('utf8')),
  }


def read_image(image_path):
  pil_image = Image.open(image_path)
  return np.asarray(pil_image)


def encode_mask_as_png(mask):
  pil_image = Image.fromarray(mask)
  output_io = io.BytesIO()
  pil_image.save(output_io, format='PNG')
  return output_io.getvalue()
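
# A round-trip sketch (commented out): a mask encoded by `encode_mask_as_png`
# can be read back with `read_image`, since PIL's `Image.open` also accepts
# file-like objects such as an in-memory buffer.
#
#   demo_mask = np.zeros((4, 4), dtype=np.uint8)
#   png_bytes = encode_mask_as_png(demo_mask)
#   recovered = read_image(io.BytesIO(png_bytes))
#   assert np.array_equal(demo_mask, recovered)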
def write_tf_record_dataset(output_path, annotation_iterator,
                            process_func, num_shards,
                            use_multiprocessing=True, unpack_arguments=True):
  """Iterates over annotations, processes them and writes into TFRecords.

  Args:
    output_path: The prefix path to create TF record files.
    annotation_iterator: An iterator of tuples containing details about the
      dataset.
    process_func: A function which takes the elements from the tuples of
      annotation_iterator as arguments and returns a tuple of (tf.train.Example,
      int). The integer indicates the number of annotations that were skipped.
    num_shards: int, the number of shards to write for the dataset.
    use_multiprocessing:
      Whether or not to use multiple processes to write TF Records.
    unpack_arguments:
      Whether to unpack the tuples from annotation_iterator as individual
      arguments to the process func or to pass the returned value as it is.

  Returns:
    num_skipped: The total number of skipped annotations.
  """

  writers = [
      tf.io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]

  total_num_annotations_skipped = 0

  if use_multiprocessing:
    pool = mp.Pool()
    if unpack_arguments:
      tf_example_iterator = pool.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = pool.imap(process_func, annotation_iterator)
  else:
    if unpack_arguments:
      tf_example_iterator = itertools.starmap(process_func,
                                              annotation_iterator)
    else:
      tf_example_iterator = map(process_func, annotation_iterator)

  for idx, (tf_example, num_annotations_skipped) in enumerate(
      tf_example_iterator):
    if idx % 100 == 0:
      logging.info('On image %d', idx)

    total_num_annotations_skipped += num_annotations_skipped
    writers[idx % num_shards].write(tf_example.SerializeToString())

  if use_multiprocessing:
    pool.close()
    pool.join()

  for writer in writers:
    writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)

  return total_num_annotations_skipped
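
# A minimal usage sketch (commented out): each tuple yielded by the iterator
# is unpacked as arguments to `process_func` (here a hypothetical
# `make_example`), which must return a (tf.train.Example, num_skipped) pair;
# example idx is written to shard idx % num_shards.
#
#   def make_example(value):
#     feature = {'x': convert_to_feature(value)}
#     return tf.train.Example(
#         features=tf.train.Features(feature=feature)), 0
#
#   write_tf_record_dataset('/tmp/demo', [(i,) for i in range(10)],
#                           make_example, num_shards=2,
#                           use_multiprocessing=False)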
def check_and_make_dir(directory):
  """Creates the directory if it doesn't exist."""
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)
official/vision/data/tfrecord_lib_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""

import os

from absl import flags
from absl.testing import parameterized
import tensorflow as tf

from official.vision.data import tfrecord_lib

FLAGS = flags.FLAGS


def process_sample(x):
  """Wraps a single feature into the (tf.train.Example, num_skipped) tuple."""
  d = {'x': x}
  return tf.train.Example(features=tf.train.Features(feature=d)), 0


def parse_function(example_proto):
  """Parses a serialized example with a single int64 feature 'x'."""
  feature_description = {
      'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
  }
  return tf.io.parse_single_example(example_proto, feature_description)


class TfrecordLibTest(parameterized.TestCase):

  def test_write_tf_record_dataset(self):
    data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]

    path = os.path.join(FLAGS.test_tmpdir, 'train')

    tfrecord_lib.write_tf_record_dataset(
        path, data, process_sample, 3, use_multiprocessing=False)
    tfrecord_files = tf.io.gfile.glob(path + '*')
    self.assertLen(tfrecord_files, 3)

    dataset = tf.data.TFRecordDataset(tfrecord_files)
    dataset = dataset.map(parse_function)

    read_values = set(d['x'] for d in dataset.as_numpy_iterator())
    self.assertSetEqual(read_values, set(range(17)))

  def test_convert_to_feature_float(self):
    proto = tfrecord_lib.convert_to_feature(0.0)
    self.assertEqual(proto.float_list.value[0], 0.0)

  def test_convert_to_feature_int(self):
    proto = tfrecord_lib.convert_to_feature(0)
    self.assertEqual(proto.int64_list.value[0], 0)

  def test_convert_to_feature_bytes(self):
    proto = tfrecord_lib.convert_to_feature(b'123')
    self.assertEqual(proto.bytes_list.value[0], b'123')

  def test_convert_to_feature_float_list(self):
    proto = tfrecord_lib.convert_to_feature([0.0, 1.0])
    self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0])

  def test_convert_to_feature_int_list(self):
    proto = tfrecord_lib.convert_to_feature([0, 1])
    self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1])

  def test_convert_to_feature_bytes_list(self):
    proto = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceAlmostEqual(proto.bytes_list.value, [b'123', b'456'])


if __name__ == '__main__':
  tf.test.main()
official/vision/dataloaders/__init__.py 0 → 100644
View file @ 0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/dataloaders/classification_input.py 0 → 100644
View file @ 0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional

# Import libraries
import tensorflow as tf

from official.vision.configs import common
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops

MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)

DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               is_multilabel: bool = False,
               keys_to_features: Optional[Dict[str, Any]] = None):
    if not keys_to_features:
      keys_to_features = {
          image_field_key:
              tf.io.FixedLenFeature((), tf.string, default_value=''),
      }
      if is_multilabel:
        keys_to_features.update(
            {label_field_key: tf.io.VarLenFeature(dtype=tf.int64)})
      else:
        keys_to_features.update({
            label_field_key:
                tf.io.FixedLenFeature((), tf.int64, default_value=-1)
        })
    self._keys_to_features = keys_to_features

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size: List[int],
               num_classes: float,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               decode_jpeg_only: bool = True,
               aug_rand_hflip: bool = True,
               aug_type: Optional[common.Augmentation] = None,
               color_jitter: float = 0.,
               random_erasing: Optional[common.RandomErasing] = None,
               is_multilabel: bool = False,
               dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      num_classes: `float`, number of classes.
      image_field_key: `str`, the key name to encoded image in tf.Example.
      label_field_key: `str`, the key name to label in tf.Example.
      decode_jpeg_only: `bool`, if True, only the JPEG format is decoded; this
        is faster than decoding other image types. Default is True.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      color_jitter: Magnitude of color jitter. If > 0, the value is used to
        generate random scale factors for brightness, contrast and saturation.
        See `preprocess_ops.color_jitter` for more details.
      random_erasing: if not None, augment input image by random erasing. See
        `augment.RandomErasing` for more details.
      is_multilabel: A `bool`, whether or not each example has multiple labels.
      dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
        or 'bfloat16'.
    """
    self._output_size = output_size
    self._aug_rand_hflip = aug_rand_hflip
    self._num_classes = num_classes
    self._image_field_key = image_field_key
    if dtype == 'float32':
      self._dtype = tf.float32
    elif dtype == 'float16':
      self._dtype = tf.float16
    elif dtype == 'bfloat16':
      self._dtype = tf.bfloat16
    else:
      raise ValueError('dtype {!r} is not supported!'.format(dtype))
    if aug_type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError('Augmentation policy {} not supported.'.format(
            aug_type.type))
    else:
      self._augmenter = None
    self._label_field_key = label_field_key
    self._color_jitter = color_jitter
    if random_erasing:
      self._random_erasing = augment.RandomErasing(
          probability=random_erasing.probability,
          min_area=random_erasing.min_area,
          max_area=random_erasing.max_area,
          min_aspect=random_erasing.min_aspect,
          max_aspect=random_erasing.max_aspect,
          min_count=random_erasing.min_count,
          max_count=random_erasing.max_count,
          trials=random_erasing.trials)
    else:
      self._random_erasing = None
    self._is_multilabel = is_multilabel
    self._decode_jpeg_only = decode_jpeg_only

  def _parse_train_data(self, decoded_tensors):
    """Parses data for training."""
    image = self._parse_train_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_eval_data(self, decoded_tensors):
    """Parses data for evaluation."""
    image = self._parse_eval_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Color jitter.
    if self._color_jitter > 0:
      image = preprocess_ops.color_jitter(image, self._color_jitter,
                                          self._color_jitter,
                                          self._color_jitter)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Random erasing after the image has been normalized.
    if self._random_erasing is not None:
      image = self._random_erasing.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  @classmethod
  def inference_fn(cls,
                   image: tf.Tensor,
                   input_image_size: List[int],
                   num_channels: int = 3) -> tf.Tensor:
    """Builds image model inputs for serving."""
    image = tf.cast(image, dtype=tf.float32)
    image = preprocess_ops.center_crop_image(image)
    image = tf.image.resize(
        image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image.set_shape(input_image_size + [num_channels])
    return image
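
An end-to-end sketch of the Decoder/Parser pair above (editor's illustration; 'train.tfrecord' and the batch size are placeholders, and it assumes the base parser.Parser exposes parse_fn(is_training) as elsewhere in the Model Garden):

import tensorflow as tf
from official.vision.dataloaders import classification_input

decoder = classification_input.Decoder()
parser = classification_input.Parser(output_size=[224, 224], num_classes=1000)

dataset = (
    tf.data.TFRecordDataset('train.tfrecord')  # placeholder input file
    .map(decoder.decode)
    .map(parser.parse_fn(is_training=True))
    .batch(32))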
official/vision/dataloaders/decoder.py 0 → 100644
View file @ 0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""

import abc


class Decoder(object):
  """Decodes the raw data into tensors."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def decode(self, serialized_example):
    """Decodes the serialized example into tensors.

    Args:
      serialized_example: a serialized string tensor that encodes the data.

    Returns:
      decoded_tensors: a dict of Tensors.
    """
    pass
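
A minimal concrete implementation of the interface above (editor's sketch; ToyDecoder and its single 'label' feature are hypothetical), mirroring what classification_input.Decoder does with a fuller feature spec:

import tensorflow as tf
from official.vision.dataloaders import decoder

class ToyDecoder(decoder.Decoder):
  """Decodes a serialized tf.Example with one int64 'label' feature."""

  def decode(self, serialized_example):
    return tf.io.parse_single_example(
        serialized_example,
        {'label': tf.io.FixedLenFeature([], tf.int64, default_value=-1)})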
official/vision/dataloaders/input_reader.py 0 → 100644
View file @ 0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple

import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader


def calculate_batch_sizes(total_batch_size: int,
                          pseudo_label_ratio: float) -> Tuple[int, int]:
  """Calculates labeled and pseudo-labeled dataset batch sizes.

  Returns (labeled_batch_size, pseudo_labeled_batch_size) given a total batch
  size and pseudo-label data ratio.

  Args:
    total_batch_size: The total batch size for all data.
    pseudo_label_ratio: A non-negative float ratio of pseudo-labeled to labeled
      data in a batch.

  Returns:
    (labeled_batch_size, pseudo_labeled_batch_size) as ints.

  Raises:
    ValueError: If total_batch_size is negative.
    ValueError: If pseudo_label_ratio is negative.
  """
  if total_batch_size < 0:
    raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
  if pseudo_label_ratio < 0.0:
    raise ValueError(
        'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))

  ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
  pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor))
  labeled_batch_size = total_batch_size - pseudo_labeled_batch_size
  return labeled_batch_size, pseudo_labeled_batch_size
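
# A worked example (editor's illustration, not part of the original file):
# with total_batch_size=96 and pseudo_label_ratio=2.0, ratio_factor is
# 2.0 / 3.0, so pseudo_labeled_batch_size = round(96 * 2 / 3) = 64 and
# labeled_batch_size = 96 - 64 = 32, i.e. a 1:2 labeled-to-pseudo mix.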


class CombinationDatasetInputReader(input_reader.InputReader):
  """Combination dataset input reader."""

  def __init__(self,
               params: cfg.DataConfig,
               dataset_fn=tf.data.TFRecordDataset,
               pseudo_label_dataset_fn=tf.data.TFRecordDataset,
               decoder_fn: Optional[Callable[..., Any]] = None,
               sample_fn: Optional[Callable[..., Any]] = None,
               parser_fn: Optional[Callable[..., Any]] = None,
               transform_and_batch_fn: Optional[Callable[
                   [tf.data.Dataset, Optional[tf.distribute.InputContext]],
                   tf.data.Dataset]] = None,
               postprocess_fn: Optional[Callable[..., Any]] = None):
    """Initializes a CombinationDatasetInputReader instance.

    This class mixes a labeled and a pseudo-labeled dataset. The params must
    contain "pseudo_label_data.input_path" to specify the pseudo-label dataset
    files and "pseudo_label_data.data_ratio" to specify a per-batch mixing
    ratio of pseudo-label examples to labeled dataset examples.

    Args:
      params: A config_definitions.DataConfig object.
      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
        example, it can be `tf.data.TFRecordDataset`.
      pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input
        files. For example, it can be `tf.data.TFRecordDataset`.
      decoder_fn: An optional `callable` that takes the serialized data string
        and decodes it into the raw tensor dictionary.
      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object
        as input and outputs the transformed dataset. It performs sampling on
        the decoded raw tensors dict before the parser_fn.
      parser_fn: An optional `callable` that takes the decoded raw tensors dict
        and parses it into a dictionary of tensors that can be consumed by the
        model. It will be executed after decoder_fn.
      transform_and_batch_fn: An optional `callable` that takes a
        `tf.data.Dataset` object and an optional `tf.distribute.InputContext`
        as input, and returns a `tf.data.Dataset` object. It will be executed
        after `parser_fn` to transform and batch the dataset; if None, after
        `parser_fn` is executed, the dataset will be batched into per-replica
        batch size.
      postprocess_fn: An optional `callable` that processes batched tensors.
        It will be executed after batching.

    Raises:
      ValueError: If drop_remainder is False.
    """
    super().__init__(
        params=params,
        dataset_fn=dataset_fn,
        decoder_fn=decoder_fn,
        sample_fn=sample_fn,
        parser_fn=parser_fn,
        transform_and_batch_fn=transform_and_batch_fn,
        postprocess_fn=postprocess_fn)
    self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
    self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
    self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
    self._pseudo_label_matched_files = input_reader.match_files(
        self._pseudo_label_file_pattern)
    if not self._drop_remainder:
      raise ValueError(
          'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(self,
           input_context: Optional[tf.distribute.InputContext] = None
          ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    # Guard against either split receiving a zero batch size.
    if not labeled_batch_size or not pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      dataset = self._read_data_source(matched_files, dataset_fn,
                                       input_context, tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size, input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)

    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)
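
The heart of read() is the zip-and-concat mixing of the two batched datasets; a self-contained sketch of that pattern with toy tensors (editor's illustration, not part of this commit):

import tensorflow as tf

labeled = tf.data.Dataset.range(8).batch(2, drop_remainder=True)
pseudo = tf.data.Dataset.range(100, 116).batch(4, drop_remainder=True)

# Each mixed batch holds 2 labeled + 4 pseudo-labeled elements, matching the
# labeled/pseudo batch-size split computed by calculate_batch_sizes.
mixed = tf.data.Dataset.zip((labeled, pseudo)).map(
    lambda x1, x2: tf.concat([x1, x2], axis=0))

for batch in mixed:
  print(batch.numpy())  # e.g. [  0   1 100 101 102 103]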