ModelZoo / ResNet50_tensorflow · Commit e4be7e00

Authored Mar 25, 2022 by Yeqing Li; committed by A. Unique TensorFlower, Mar 25, 2022

Removes unneeded content of the beta folder.

PiperOrigin-RevId: 437276665
Parent: f47405b5

Changes: 235 files in the commit; showing 20 changed files with 0 additions and 3782 deletions.
official/vision/beta/configs/semantic_segmentation.py        +0 −712
official/vision/beta/configs/semantic_segmentation_test.py   +0 −45
official/vision/beta/configs/video_classification.py         +0 −370
official/vision/beta/configs/video_classification_test.py    +0 −44
official/vision/beta/data/__init__.py                        +0 −14
official/vision/beta/data/create_coco_tf_record.py           +0 −554
official/vision/beta/data/process_coco_few_shot.sh           +0 −70
official/vision/beta/data/process_coco_few_shot_json_files.py +0 −144
official/vision/beta/data/process_coco_panoptic.sh           +0 −40
official/vision/beta/data/tfrecord_lib.py                    +0 −181
official/vision/beta/data/tfrecord_lib_test.py               +0 −93
official/vision/beta/dataloaders/__init__.py                 +0 −14
official/vision/beta/dataloaders/classification_input.py     +0 −273
official/vision/beta/dataloaders/decoder.py                  +0 −35
official/vision/beta/dataloaders/input_reader.py             +0 −178
official/vision/beta/dataloaders/input_reader_factory.py     +0 −43
official/vision/beta/dataloaders/maskrcnn_input.py           +0 −345
official/vision/beta/dataloaders/parser.py                   +0 −81
official/vision/beta/dataloaders/retinanet_input.py          +0 −328
official/vision/beta/dataloaders/segmentation_input.py       +0 −218
official/vision/beta/configs/semantic_segmentation.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Semantic segmentation configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import common
from official.vision.beta.configs import decoders
from official.vision.beta.configs import backbones


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  # If crop_size is specified, the image is first resized to output_size and
  # a crop of size crop_size is then taken from it.
  crop_size: List[int] = dataclasses.field(default_factory=list)
  input_path: str = ''
  global_batch_size: int = 0
  is_training: bool = True
  dtype: str = 'float32'
  shuffle_buffer_size: int = 1000
  cycle_length: int = 10
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  preserve_aspect_ratio: bool = True
  aug_policy: Optional[str] = None
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  decoder: Optional[common.DataDecoder] = common.DataDecoder()


@dataclasses.dataclass
class SegmentationHead(hyperparams.Config):
  """Segmentation head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  use_depthwise_convolution: bool = False
  prediction_kernel_size: int = 1
  upsample_factor: int = 1
  # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
  feature_fusion: Optional[str] = None
  # deeplabv3plus feature fusion params
  low_level: Union[int, str] = 2
  low_level_num_filters: int = 48
  # panoptic_fpn_fusion params
  decoder_min_level: Optional[Union[int, str]] = None
  decoder_max_level: Optional[Union[int, str]] = None


@dataclasses.dataclass
class MaskScoringHead(hyperparams.Config):
  """Mask Scoring head config."""
  num_convs: int = 4
  num_filters: int = 128
  fc_input_size: List[int] = dataclasses.field(default_factory=list)
  num_fcs: int = 2
  fc_dims: int = 1024


@dataclasses.dataclass
class SemanticSegmentationModel(hyperparams.Config):
  """Semantic segmentation model config."""
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  head: SegmentationHead = SegmentationHead()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='identity')
  mask_scoring_head: Optional[MaskScoringHead] = None
  norm_activation: common.NormActivation = common.NormActivation()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  loss_weight: float = 1.0
  label_smoothing: float = 0.0
  ignore_label: int = 255
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 0.0
  use_groundtruth_dimension: bool = True
  top_k_percent_pixels: float = 1.0


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  report_per_class_iou: bool = True
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class SemanticSegmentationTask(cfg.TaskConfig):
  """The model config."""
  model: SemanticSegmentationModel = SemanticSegmentationModel()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  losses: Losses = Losses()
  evaluation: Evaluation = Evaluation()
  train_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  eval_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  init_checkpoint: Optional[str] = None
  # all, backbone, and/or decoder
  init_checkpoint_modules: Union[str, List[str]] = 'all'


@exp_factory.register_config_factory('semantic_segmentation')
def semantic_segmentation() -> cfg.ExperimentConfig:
  """Semantic segmentation general."""
  return cfg.ExperimentConfig(
      task=SemanticSegmentationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])


# PASCAL VOC 2012 Dataset
PASCAL_TRAIN_EXAMPLES = 10582
PASCAL_VAL_EXAMPLES = 1449
PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg'


@exp_factory.register_config_factory('seg_deeplabv3_pascal')
def seg_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [12, 24, 36]  # [6, 12, 18] if output_stride = 16
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      stem_type=stem_type)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              # TODO(arashwan): test changing size to 513 to match deeplab.
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('seg_deeplabv3plus_pascal')
def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('seg_resnetfpn_pascal')
def seg_resnetfpn_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet-fpn."""
  train_batch_size = 256
  eval_batch_size = 32
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[512, 512, 3],
              min_level=3,
              max_level=7,
              backbone=backbones.Backbone(
                  type='resnet', resnet=backbones.ResNet(model_id=50)),
              decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()),
              head=SegmentationHead(level=3, num_convs=3),
              norm_activation=common.NormActivation(
                  activation='swish', use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.2,
              aug_scale_max=1.5),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
      ),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=450 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 450 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3_pascal')
def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  level = int(np.math.log2(output_stride))
  pool_kernel_size = []
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # mobilenetv2
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules=['backbone', 'decoder']),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=30000,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007 * train_batch_size / 16,
                      'decay_steps': 30000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


# Cityscapes Dataset (Download and process the dataset yourself)
CITYSCAPES_TRAIN_EXAMPLES = 2975
CITYSCAPES_VAL_EXAMPLES = 500
CITYSCAPES_INPUT_PATH_BASE = 'cityscapes'


@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes')
def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=[512, 1024])),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=500 * steps_per_epoch,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 500 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes')
def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  pool_kernel_size = [512, 1024]
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # Coco pre-trained mobilenetv2 checkpoint
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=100000,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 100000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3plus_cityscapes')
def mnv2_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with mobilenetv2 deeplabv3plus."""
  config = mnv2_deeplabv3_cityscapes()
  config.task.model.head = SegmentationHead(
      level=4,
      num_convs=2,
      feature_fusion='deeplabv3plus',
      use_depthwise_convolution=True,
      low_level='2/depthwise',
      low_level_num_filters=48)
  config.task.model.backbone.mobilenet.output_intermediate_endpoints = True
  return config
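
For orientation, the factories registered above are looked up by name through exp_factory (the companion test below does the same); here is a minimal usage sketch, assuming a checkout where these beta modules are still importable. The override values are illustrative, not taken from this file.

# Hedged sketch: retrieve a registered experiment config and override fields.
# Assumes official.vision.beta.configs still exists at this revision.
from official.core import exp_factory
from official.vision.beta.configs import semantic_segmentation  # registers factories

config = exp_factory.get_exp_config('seg_deeplabv3_pascal')
config.task.train_data.global_batch_size = 32  # illustrative override
config.trainer.train_steps = 1000              # illustrative override
config.validate()  # enforces the restrictions declared at registration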
official/vision/beta/configs/semantic_segmentation_test.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision import beta
from official.vision.beta.configs import semantic_segmentation as exp_cfg


class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('seg_deeplabv3_pascal',),
                            ('seg_deeplabv3plus_pascal',))
  def test_semantic_segmentation_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask)
    self.assertIsInstance(config.task.model, exp_cfg.SemanticSegmentationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/configs/video_classification.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import backbones_3d
from official.vision.beta.configs import common


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building datasets."""
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  num_classes: int = -1
  num_examples: int = -1
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: str = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  is_multilabel: bool = False
  output_audio: bool = False
  audio_feature: str = ''
  audio_feature_shape: Tuple[int, ...] = (-1,)
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[str] = None  # 'autoaug', 'randaug', or None
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'


def kinetics400(is_training):
  """Generates Kinetics 400 dataset configs."""
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=215570 if is_training else 17706,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics600(is_training):
  """Generates Kinetics 600 dataset configs."""
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=366016 if is_training else 27780,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700(is_training):
  """Generates Kinetics 700 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700_2020(is_training):
  """Generates Kinetics 700 (2020) dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The model config."""
  model_type: str = 'video_classification'
  backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D(
      type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50())
  norm_activation: common.NormActivation = common.NormActivation(
      use_sync_bn=False)
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None


@dataclasses.dataclass
class Losses(hyperparams.Config):
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0


@dataclasses.dataclass
class Metrics(hyperparams.Config):
  use_per_class_recall: bool = False


@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The task config."""
  model: VideoClassificationModel = VideoClassificationModel()
  train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  metrics: Metrics = Metrics()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None


def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Adds and configures a trainer in the experiment config."""
  if experiment.task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(
        experiment.task.train_data))
  if experiment.task.validation_data.num_examples <= 0:
    raise ValueError('Wrong validation dataset size {!r}'.format(
        experiment.task.validation_data))
  experiment.task.train_data.global_batch_size = train_batch_size
  experiment.task.validation_data.global_batch_size = eval_batch_size
  steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=train_epochs * steps_per_epoch,
      validation_steps=experiment.task.validation_data.num_examples //
      eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimization.OptimizationConfig({
          'optimizer': {
              'type': 'sgd',
              'sgd': {
                  'momentum': 0.9,
                  'nesterov': True,
              }
          },
          'learning_rate': {
              'type': 'cosine',
              'cosine': {
                  'initial_learning_rate': learning_rate,
                  'decay_steps': train_epochs * steps_per_epoch,
              }
          },
          'warmup': {
              'type': 'linear',
              'linear': {
                  'warmup_steps': warmup_epochs * steps_per_epoch,
                  'warmup_learning_rate': 0
              }
          }
      }))
  return experiment


@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general."""
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])


@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet."""
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config


@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 400 with resnet."""
  train_dataset = kinetics400(is_training=True)
  validation_dataset = kinetics400(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 600 with resnet."""
  train_dataset = kinetics600(is_training=True)
  validation_dataset = kinetics600(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 with resnet."""
  train_dataset = kinetics700(is_training=True)
  validation_dataset = kinetics700(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 2020 with resnet."""
  train_dataset = kinetics700_2020(is_training=True)
  validation_dataset = kinetics700_2020(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
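
Since add_trainer derives the schedule from the dataset's num_examples, here is a short hedged sketch of composing it with one of the factories above; the batch sizes are illustrative, not values from this file.

# Hedged sketch: re-attach a trainer with different batch sizes.
# Assumes official.vision.beta.configs is still importable at this revision;
# batch sizes here are illustrative.
config = video_classification_kinetics400()
# steps_per_epoch becomes 215570 // 512; train_steps defaults to 44 epochs.
config = add_trainer(config, train_batch_size=512, eval_batch_size=64)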
official/vision/beta/configs/video_classification_test.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision import beta
from official.vision.beta.configs import video_classification as exp_cfg


class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('video_classification',),
                            ('video_classification_kinetics600',))
  def test_video_classification_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask)
    self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/data/__init__.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/beta/data/create_coco_tf_record.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r
"""Convert raw COCO dataset to TFRecord format.
This scripts follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import
collections
import
json
import
logging
import
os
from
absl
import
app
# pylint:disable=unused-import
from
absl
import
flags
import
numpy
as
np
from
pycocotools
import
mask
import
tensorflow
as
tf
import
multiprocessing
as
mp
from
official.vision.beta.data
import
tfrecord_lib
flags
.
DEFINE_boolean
(
'include_masks'
,
False
,
'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.'
)
flags
.
DEFINE_multi_string
(
'image_dir'
,
''
,
'Directory containing images.'
)
flags
.
DEFINE_string
(
'image_info_file'
,
''
,
'File containing image information. '
'Tf Examples in the output files correspond to the image '
'info entries in this file. If this file is not provided '
'object_annotations_file is used if present. Otherwise, '
'caption_annotations_file is used to get image info.'
)
flags
.
DEFINE_string
(
'object_annotations_file'
,
''
,
'File containing object '
'annotations - boxes and instance masks.'
)
flags
.
DEFINE_string
(
'caption_annotations_file'
,
''
,
'File containing image '
'captions.'
)
flags
.
DEFINE_string
(
'panoptic_annotations_file'
,
''
,
'File containing panoptic '
'annotations.'
)
flags
.
DEFINE_string
(
'panoptic_masks_dir'
,
''
,
'Directory containing panoptic masks annotations.'
)
flags
.
DEFINE_boolean
(
'include_panoptic_masks'
,
False
,
'Whether to include category and '
'instance masks in the result. These are required to run the PQ evaluator '
'default: False.'
)
flags
.
DEFINE_string
(
'output_file_prefix'
,
'/tmp/train'
,
'Path to output file'
)
flags
.
DEFINE_integer
(
'num_shards'
,
32
,
'Number of shards for output file.'
)
FLAGS
=
flags
.
FLAGS
logger
=
tf
.
get_logger
()
logger
.
setLevel
(
logging
.
INFO
)
_VOID_LABEL
=
0
_VOID_INSTANCE_ID
=
0
_THING_CLASS_ID
=
1
_STUFF_CLASSES_OFFSET
=
90
def
coco_segmentation_to_mask_png
(
segmentation
,
height
,
width
,
is_crowd
):
"""Encode a COCO mask segmentation as PNG string."""
run_len_encoding
=
mask
.
frPyObjects
(
segmentation
,
height
,
width
)
binary_mask
=
mask
.
decode
(
run_len_encoding
)
if
not
is_crowd
:
binary_mask
=
np
.
amax
(
binary_mask
,
axis
=
2
)
return
tfrecord_lib
.
encode_mask_as_png
(
binary_mask
)
def
generate_coco_panoptics_masks
(
segments_info
,
mask_path
,
include_panoptic_masks
,
is_category_thing
):
"""Creates masks for panoptic segmentation task.
Args:
segments_info: a list of dicts, where each dict has keys: [u'id',
u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
each segment in the panoptic mask.
mask_path: path to the panoptic mask.
include_panoptic_masks: bool, when set to True, category and instance
masks are included in the outputs. Set this to True, when using
the Panoptic Quality evaluator.
is_category_thing: a dict with category ids as keys and, 0/1 as values to
represent "stuff" and "things" classes respectively.
Returns:
A dict with with keys: [u'semantic_segmentation_mask', u'category_mask',
u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
only if `include_panoptic_eval_masks` is set to True.
"""
rgb_mask
=
tfrecord_lib
.
read_image
(
mask_path
)
r
,
g
,
b
=
np
.
split
(
rgb_mask
,
3
,
axis
=-
1
)
# decode rgb encoded panoptic mask to get segments ids
# refer https://cocodataset.org/#format-data
segments_encoded_mask
=
(
r
+
g
*
256
+
b
*
(
256
**
2
)).
squeeze
()
semantic_segmentation_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_LABEL
if
include_panoptic_masks
:
category_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_LABEL
instance_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_INSTANCE_ID
for
idx
,
segment
in
enumerate
(
segments_info
):
segment_id
=
segment
[
'id'
]
category_id
=
segment
[
'category_id'
]
if
is_category_thing
[
category_id
]:
encoded_category_id
=
_THING_CLASS_ID
instance_id
=
idx
+
1
else
:
encoded_category_id
=
category_id
-
_STUFF_CLASSES_OFFSET
instance_id
=
_VOID_INSTANCE_ID
segment_mask
=
(
segments_encoded_mask
==
segment_id
)
semantic_segmentation_mask
[
segment_mask
]
=
encoded_category_id
if
include_panoptic_masks
:
category_mask
[
segment_mask
]
=
category_id
instance_mask
[
segment_mask
]
=
instance_id
outputs
=
{
'semantic_segmentation_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
semantic_segmentation_mask
)
}
if
include_panoptic_masks
:
outputs
.
update
({
'category_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
category_mask
),
'instance_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
instance_mask
)
})
return
outputs
def
coco_annotations_to_lists
(
bbox_annotations
,
id_to_name_map
,
image_height
,
image_width
,
include_masks
):
"""Converts COCO annotations to feature lists."""
data
=
dict
((
k
,
list
())
for
k
in
[
'xmin'
,
'xmax'
,
'ymin'
,
'ymax'
,
'is_crowd'
,
'category_id'
,
'category_names'
,
'area'
])
if
include_masks
:
data
[
'encoded_mask_png'
]
=
[]
num_annotations_skipped
=
0
for
object_annotations
in
bbox_annotations
:
(
x
,
y
,
width
,
height
)
=
tuple
(
object_annotations
[
'bbox'
])
if
width
<=
0
or
height
<=
0
:
num_annotations_skipped
+=
1
continue
if
x
+
width
>
image_width
or
y
+
height
>
image_height
:
num_annotations_skipped
+=
1
continue
data
[
'xmin'
].
append
(
float
(
x
)
/
image_width
)
data
[
'xmax'
].
append
(
float
(
x
+
width
)
/
image_width
)
data
[
'ymin'
].
append
(
float
(
y
)
/
image_height
)
data
[
'ymax'
].
append
(
float
(
y
+
height
)
/
image_height
)
data
[
'is_crowd'
].
append
(
object_annotations
[
'iscrowd'
])
category_id
=
int
(
object_annotations
[
'category_id'
])
data
[
'category_id'
].
append
(
category_id
)
data
[
'category_names'
].
append
(
id_to_name_map
[
category_id
].
encode
(
'utf8'
))
data
[
'area'
].
append
(
object_annotations
[
'area'
])
if
include_masks
:
data
[
'encoded_mask_png'
].
append
(
coco_segmentation_to_mask_png
(
object_annotations
[
'segmentation'
],
image_height
,
image_width
,
object_annotations
[
'iscrowd'
])
)
return
data
,
num_annotations_skipped
def
bbox_annotations_to_feature_dict
(
bbox_annotations
,
image_height
,
image_width
,
id_to_name_map
,
include_masks
):
"""Convert COCO annotations to an encoded feature dict."""
data
,
num_skipped
=
coco_annotations_to_lists
(
bbox_annotations
,
id_to_name_map
,
image_height
,
image_width
,
include_masks
)
feature_dict
=
{
'image/object/bbox/xmin'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'xmin'
]),
'image/object/bbox/xmax'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'xmax'
]),
'image/object/bbox/ymin'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'ymin'
]),
'image/object/bbox/ymax'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'ymax'
]),
'image/object/class/text'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'category_names'
]),
'image/object/class/label'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'category_id'
]),
'image/object/is_crowd'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'is_crowd'
]),
'image/object/area'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'area'
]),
}
if
include_masks
:
feature_dict
[
'image/object/mask'
]
=
(
tfrecord_lib
.
convert_to_feature
(
data
[
'encoded_mask_png'
]))
return
feature_dict
,
num_skipped
def
encode_caption_annotations
(
caption_annotations
):
captions
=
[]
for
caption_annotation
in
caption_annotations
:
captions
.
append
(
caption_annotation
[
'caption'
].
encode
(
'utf8'
))
return
captions
def
create_tf_example
(
image
,
image_dirs
,
panoptic_masks_dir
=
None
,
bbox_annotations
=
None
,
id_to_name_map
=
None
,
caption_annotations
=
None
,
panoptic_annotation
=
None
,
is_category_thing
=
None
,
include_panoptic_masks
=
False
,
include_masks
=
False
):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
image_dirs: list of directories containing the image files.
panoptic_masks_dir: `str` of the panoptic masks directory.
bbox_annotations:
list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
coordinates in the official COCO dataset are given as [x, y, width,
height] tuples using absolute coordinates where x, y represent the
top-left (0-indexed) corner. This function converts to the format
expected by the Tensorflow Object Detection API (which is which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative to image
size).
id_to_name_map: a dict mapping category IDs to string names.
caption_annotations:
list of dict with keys: [u'id', u'image_id', u'str'].
panoptic_annotation: dict with keys: [u'image_id', u'file_name',
u'segments_info']. Where the value for segments_info is a list of dicts,
with each dict containing information for a single segment in the mask.
is_category_thing: `bool`, whether it is a category thing.
include_panoptic_masks: `bool`, whether to include panoptic masks.
include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
does not exist, or is not unique across image directories.
"""
image_height
=
image
[
'height'
]
image_width
=
image
[
'width'
]
filename
=
image
[
'file_name'
]
image_id
=
image
[
'id'
]
if
len
(
image_dirs
)
>
1
:
full_paths
=
[
os
.
path
.
join
(
image_dir
,
filename
)
for
image_dir
in
image_dirs
]
full_existing_paths
=
[
p
for
p
in
full_paths
if
tf
.
io
.
gfile
.
exists
(
p
)]
if
not
full_existing_paths
:
raise
ValueError
(
'{} does not exist across image directories.'
.
format
(
filename
))
if
len
(
full_existing_paths
)
>
1
:
raise
ValueError
(
'{} is not unique across image directories'
.
format
(
filename
))
full_path
,
=
full_existing_paths
# If there is only one image directory, it's not worth checking for existence,
# since trying to open the file will raise an informative error message if it
# does not exist.
else
:
image_dir
,
=
image_dirs
full_path
=
os
.
path
.
join
(
image_dir
,
filename
)
with
tf
.
io
.
gfile
.
GFile
(
full_path
,
'rb'
)
as
fid
:
encoded_jpg
=
fid
.
read
()
feature_dict
=
tfrecord_lib
.
image_info_to_feature_dict
(
image_height
,
image_width
,
filename
,
image_id
,
encoded_jpg
,
'jpg'
)
num_annotations_skipped
=
0
if
bbox_annotations
:
box_feature_dict
,
num_skipped
=
bbox_annotations_to_feature_dict
(
bbox_annotations
,
image_height
,
image_width
,
id_to_name_map
,
include_masks
)
num_annotations_skipped
+=
num_skipped
feature_dict
.
update
(
box_feature_dict
)
if
caption_annotations
:
encoded_captions
=
encode_caption_annotations
(
caption_annotations
)
feature_dict
.
update
(
{
'image/caption'
:
tfrecord_lib
.
convert_to_feature
(
encoded_captions
)})
if
panoptic_annotation
:
segments_info
=
panoptic_annotation
[
'segments_info'
]
panoptic_mask_filename
=
os
.
path
.
join
(
panoptic_masks_dir
,
panoptic_annotation
[
'file_name'
])
encoded_panoptic_masks
=
generate_coco_panoptics_masks
(
segments_info
,
panoptic_mask_filename
,
include_panoptic_masks
,
is_category_thing
)
feature_dict
.
update
(
{
'image/segmentation/class/encoded'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'semantic_segmentation_mask'
])})
if
include_panoptic_masks
:
feature_dict
.
update
({
'image/panoptic/category_mask'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'category_mask'
]),
'image/panoptic/instance_mask'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'instance_mask'
])
})
example
=
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
feature_dict
))
return
example
,
num_annotations_skipped
def
_load_object_annotations
(
object_annotations_file
):
"""Loads object annotation JSON file."""
with
tf
.
io
.
gfile
.
GFile
(
object_annotations_file
,
'r'
)
as
fid
:
obj_annotations
=
json
.
load
(
fid
)
images
=
obj_annotations
[
'images'
]
id_to_name_map
=
dict
((
element
[
'id'
],
element
[
'name'
])
for
element
in
obj_annotations
[
'categories'
])
img_to_obj_annotation
=
collections
.
defaultdict
(
list
)
logging
.
info
(
'Building bounding box index.'
)
for
annotation
in
obj_annotations
[
'annotations'
]:
image_id
=
annotation
[
'image_id'
]
img_to_obj_annotation
[
image_id
].
append
(
annotation
)
missing_annotation_count
=
0
for
image
in
images
:
image_id
=
image
[
'id'
]
if
image_id
not
in
img_to_obj_annotation
:
missing_annotation_count
+=
1
logging
.
info
(
'%d images are missing bboxes.'
,
missing_annotation_count
)
return
img_to_obj_annotation
,
id_to_name_map


def _load_caption_annotations(caption_annotations_file):
  """Loads caption annotation JSON file."""
  with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
    caption_annotations = json.load(fid)

  img_to_caption_annotation = collections.defaultdict(list)
  logging.info('Building caption index.')
  for annotation in caption_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_caption_annotation[image_id].append(annotation)

  missing_annotation_count = 0
  images = caption_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_caption_annotation:
      missing_annotation_count += 1

  logging.info('%d images are missing captions.', missing_annotation_count)
  return img_to_caption_annotation


def _load_panoptic_annotations(panoptic_annotations_file):
  """Loads panoptic annotation from file."""
  with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
    panoptic_annotations = json.load(fid)

  img_to_panoptic_annotation = dict()
  logging.info('Building panoptic index.')
  for annotation in panoptic_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_panoptic_annotation[image_id] = annotation

  is_category_thing = dict()
  for category_info in panoptic_annotations['categories']:
    is_category_thing[category_info['id']] = category_info['isthing'] == 1

  missing_annotation_count = 0
  images = panoptic_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_panoptic_annotation:
      missing_annotation_count += 1
  logging.info(
      '%d images are missing panoptic annotations.', missing_annotation_count)

  return img_to_panoptic_annotation, is_category_thing


def _load_images_info(images_info_file):
  with tf.io.gfile.GFile(images_info_file, 'r') as fid:
    info_dict = json.load(fid)
  return info_dict['images']


def generate_annotations(images,
                         image_dirs,
                         panoptic_masks_dir=None,
                         img_to_obj_annotation=None,
                         img_to_caption_annotation=None,
                         img_to_panoptic_annotation=None,
                         is_category_thing=None,
                         id_to_name_map=None,
                         include_panoptic_masks=False,
                         include_masks=False):
  """Generator for COCO annotations."""
  for image in images:
    object_annotation = (img_to_obj_annotation.get(image['id'], None)
                         if img_to_obj_annotation else None)
    caption_annotation = (img_to_caption_annotation.get(image['id'], None)
                          if img_to_caption_annotation else None)
    panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None)
                           if img_to_panoptic_annotation else None)
    yield (image, image_dirs, panoptic_masks_dir, object_annotation,
           id_to_name_map, caption_annotation, panoptic_annotation,
           is_category_thing, include_panoptic_masks, include_masks)


def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dirs,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            panoptic_masks_dir=None,
                                            panoptic_annotations_file=None,
                                            include_panoptic_masks=False,
                                            include_masks=False):
  """Loads COCO annotation json files and converts to tf.Record format.

  Args:
    images_info_file: JSON file containing image info. The number of
      tf.Examples in the output tf.Record files is exactly equal to the number
      of image info entries in this file. This can be any of the
      train/val/test annotation json files, e.g.
      'image_info_test-dev2017.json', 'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dirs: List of directories containing the image files.
    output_path: Path to output tf.Record file.
    num_shards: Number of output files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    panoptic_masks_dir: Directory containing panoptic masks.
    panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask' and
      'instance_mask', which are required by the panoptic quality evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
  """
  logging.info('writing to output path: %s', output_path)

  images = _load_images_info(images_info_file)

  img_to_obj_annotation = None
  img_to_caption_annotation = None
  id_to_name_map = None
  img_to_panoptic_annotation = None
  is_category_thing = None
  if object_annotations_file:
    img_to_obj_annotation, id_to_name_map = (
        _load_object_annotations(object_annotations_file))
  if caption_annotations_file:
    img_to_caption_annotation = (
        _load_caption_annotations(caption_annotations_file))
  if panoptic_annotations_file:
    img_to_panoptic_annotation, is_category_thing = (
        _load_panoptic_annotations(panoptic_annotations_file))

  coco_annotations_iter = generate_annotations(
      images=images,
      image_dirs=image_dirs,
      panoptic_masks_dir=panoptic_masks_dir,
      img_to_obj_annotation=img_to_obj_annotation,
      img_to_caption_annotation=img_to_caption_annotation,
      img_to_panoptic_annotation=img_to_panoptic_annotation,
      is_category_thing=is_category_thing,
      id_to_name_map=id_to_name_map,
      include_panoptic_masks=include_panoptic_masks,
      include_masks=include_masks)

  num_skipped = tfrecord_lib.write_tf_record_dataset(
      output_path, coco_annotations_iter, create_tf_example, num_shards)

  logging.info('Finished writing, skipped %d annotations.', num_skipped)


def main(_):
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')
  if FLAGS.image_info_file:
    images_info_file = FLAGS.image_info_file
  elif FLAGS.object_annotations_file:
    images_info_file = FLAGS.object_annotations_file
  else:
    images_info_file = FLAGS.caption_annotations_file

  directory = os.path.dirname(FLAGS.output_file_prefix)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.panoptic_masks_dir,
                                          FLAGS.panoptic_annotations_file,
                                          FLAGS.include_panoptic_masks,
                                          FLAGS.include_masks)


if __name__ == '__main__':
  app.run(main)

official/vision/beta/data/process_coco_few_shot.sh  deleted  100644 → 0

#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.

tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"

while getopts ":i:o:" o; do
  case "${o}" in
    o) output_dir=${OPTARG} ;;
    i) base_image_dir=${OPTARG} ;;
    *) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
  esac
done

cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" \
    -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

for seed in {0..9}; do
  for shots in 1 3 5 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir="${base_image_dir}/train2014" \
        --image_dir="${base_image_dir}/val2014" \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/trainvalno5k_base" \
    --num_shards=200

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k_base" \
    --num_shards=10

rm -rf "${tmp_dir}"

official/vision/beta/data/process_coco_few_shot_json_files.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os

from absl import app
from absl import flags

import tensorflow as tf

logger = tf.get_logger()
logger.setLevel(logging.INFO)

flags.DEFINE_string('workdir', None, 'Working directory.')

FLAGS = flags.FLAGS

CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
              'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
              'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
              'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
              'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
              'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
              'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
              'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
              'parking meter', 'person', 'pizza', 'potted plant',
              'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
              'sink', 'skateboard', 'skis', 'snowboard', 'spoon',
              'sports ball', 'stop sign', 'suitcase', 'surfboard',
              'teddy bear', 'tennis racket', 'tie', 'toaster', 'toilet',
              'toothbrush', 'traffic light', 'train', 'truck', 'tv',
              'umbrella', 'vase', 'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]

FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  for _category in CATEGORIES:
    FILE_SUFFIXES[(_seed, _shots)].append(
        '{}full_box_{}shot_{}_trainval.json'.format(
            # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
            #
            #     datasplit/
            #       trainvalno5k.json
            #       5k.json
            #     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #     seed{1-9}/
            #       full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #
            # This means that the JSON files for seed0 are located in the root
            # directory rather than in a `seed?/` subdirectory, hence the
            # conditional expression below.
            '' if _seed == 0 else 'seed{}/'.format(_seed),
            _shots,
            _category))

# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33,
                  34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50,
                  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74,
                  75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]


def main(unused_argv):
  workdir = FLAGS.workdir

  # Filter novel class annotations from the training and validation sets.
  for name in ('trainvalno5k', '5k'):
    file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
    with tf.io.gfile.GFile(file_path, 'r') as f:
      json_dict = json.load(f)

    json_dict['annotations'] = [a for a in json_dict['annotations']
                                if a['category_id'] in BASE_CLASS_IDS]
    output_path = os.path.join(workdir, 'datasplit',
                               '{}_base.json'.format(name))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(json_dict, f)

  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have
    # the same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries.
    images = list({image['id']: image for image in images}.values())

    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)

official/vision/beta/data/process_coco_panoptic.sh  deleted  100644 → 0

#!/bin/bash

sudo apt update
sudo apt install unzip aria2 -y

DATA_DIR=$1

aria2c -j 8 -Z \
  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
  http://images.cocodataset.org/zips/train2017.zip \
  http://images.cocodataset.org/zips/val2017.zip \
  --dir=$DATA_DIR;

unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;

unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/val2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/val" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
  --num_shards=8 \
  --include_masks \
  --include_panoptic_masks

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/train2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/train" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
  --num_shards=32 \
  --include_masks \
  --include_panoptic_masks

official/vision/beta/data/tfrecord_lib.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools

from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf

import multiprocessing as mp


def convert_to_feature(value, value_type=None):
  """Converts the given python object to a tf.train.Feature.

  Args:
    value: int, float, bytes or a list of them.
    value_type: optional, if specified, forces the feature to be of the given
      type. Otherwise, type is inferred automatically. Can be one of
      ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list'].

  Returns:
    feature: A tf.train.Feature object.
  """
  if value_type is None:
    element = value[0] if isinstance(value, list) else value

    if isinstance(element, bytes):
      value_type = 'bytes'
    elif isinstance(element, (int, np.integer)):
      value_type = 'int64'
    elif isinstance(element, (float, np.floating)):
      value_type = 'float'
    else:
      raise ValueError('Cannot convert type {} to feature'.format(
          type(element)))

    if isinstance(value, list):
      value_type = value_type + '_list'

  if value_type == 'int64':
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
  elif value_type == 'int64_list':
    value = np.asarray(value).astype(np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
  elif value_type == 'float':
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
  elif value_type == 'float_list':
    value = np.asarray(value).astype(np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
  elif value_type == 'bytes':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
  elif value_type == 'bytes_list':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
  else:
    raise ValueError('Unknown value_type parameter - {}'.format(value_type))
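
As a quick illustration (the values below are arbitrary), the inferred
feature kind comes from the first element, so an empty list needs an explicit
`value_type`:

f_int = convert_to_feature(7)                              # int64
f_floats = convert_to_feature([0.5, 1.5])                  # float_list
f_empty = convert_to_feature([], value_type='int64_list')  # must be explicit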


def image_info_to_feature_dict(height, width, filename, image_id,
                               encoded_str, encoded_format):
  """Convert image information to a dict of features."""
  key = hashlib.sha256(encoded_str).hexdigest()

  return {
      'image/height': convert_to_feature(height),
      'image/width': convert_to_feature(width),
      'image/filename': convert_to_feature(filename.encode('utf8')),
      'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
      'image/key/sha256': convert_to_feature(key.encode('utf8')),
      'image/encoded': convert_to_feature(encoded_str),
      'image/format': convert_to_feature(encoded_format.encode('utf8')),
  }


def read_image(image_path):
  pil_image = Image.open(image_path)
  return np.asarray(pil_image)


def encode_mask_as_png(mask):
  pil_image = Image.fromarray(mask)
  output_io = io.BytesIO()
  pil_image.save(output_io, format='PNG')
  return output_io.getvalue()
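
A small round-trip sanity check (illustrative, not from the original file):
`read_image` accepts anything `PIL.Image.open` does, including a file-like
object wrapping the PNG bytes produced above.

mask = np.zeros((4, 4), dtype=np.uint8)  # a toy binary mask
mask[1:3, 1:3] = 1
png_bytes = encode_mask_as_png(mask)
assert read_image(io.BytesIO(png_bytes)).shape == (4, 4)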


def write_tf_record_dataset(output_path, annotation_iterator,
                            process_func, num_shards,
                            use_multiprocessing=True,
                            unpack_arguments=True):
  """Iterates over annotations, processes them and writes into TFRecords.

  Args:
    output_path: The prefix path to create TF record files.
    annotation_iterator: An iterator of tuples containing details about the
      dataset.
    process_func: A function which takes the elements from the tuples of
      annotation_iterator as arguments and returns a tuple of
      (tf.train.Example, int). The integer indicates the number of annotations
      that were skipped.
    num_shards: int, the number of shards to write for the dataset.
    use_multiprocessing: Whether or not to use multiple processes to write
      TF Records.
    unpack_arguments: Whether to unpack the tuples from annotation_iterator as
      individual arguments to the process func or to pass the returned value
      as it is.

  Returns:
    num_skipped: The total number of skipped annotations.
  """
  writers = [
      tf.io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]

  total_num_annotations_skipped = 0

  if use_multiprocessing:
    pool = mp.Pool()
    if unpack_arguments:
      tf_example_iterator = pool.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = pool.imap(process_func, annotation_iterator)
  else:
    if unpack_arguments:
      tf_example_iterator = itertools.starmap(process_func,
                                              annotation_iterator)
    else:
      tf_example_iterator = map(process_func, annotation_iterator)

  for idx, (tf_example, num_annotations_skipped) in enumerate(
      tf_example_iterator):
    if idx % 100 == 0:
      logging.info('On image %d', idx)

    total_num_annotations_skipped += num_annotations_skipped
    writers[idx % num_shards].write(tf_example.SerializeToString())

  if use_multiprocessing:
    pool.close()
    pool.join()

  for writer in writers:
    writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)
  return total_num_annotations_skipped


def check_and_make_dir(directory):
  """Creates the directory if it doesn't exist."""
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

official/vision/beta/data/tfrecord_lib_test.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os

from absl import flags
from absl.testing import parameterized
import tensorflow as tf

from official.vision.beta.data import tfrecord_lib

FLAGS = flags.FLAGS


def process_sample(x):
  d = {'x': x}
  return tf.train.Example(features=tf.train.Features(feature=d)), 0


def parse_function(example_proto):
  feature_description = {
      'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
  }
  return tf.io.parse_single_example(example_proto, feature_description)


class TfrecordLibTest(parameterized.TestCase):

  def test_write_tf_record_dataset(self):
    data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]

    path = os.path.join(FLAGS.test_tmpdir, 'train')

    tfrecord_lib.write_tf_record_dataset(
        path, data, process_sample, 3, use_multiprocessing=False)
    tfrecord_files = tf.io.gfile.glob(path + '*')

    self.assertLen(tfrecord_files, 3)

    dataset = tf.data.TFRecordDataset(tfrecord_files)
    dataset = dataset.map(parse_function)

    read_values = set(d['x'] for d in dataset.as_numpy_iterator())
    self.assertSetEqual(read_values, set(range(17)))

  def test_convert_to_feature_float(self):
    proto = tfrecord_lib.convert_to_feature(0.0)
    self.assertEqual(proto.float_list.value[0], 0.0)

  def test_convert_to_feature_int(self):
    proto = tfrecord_lib.convert_to_feature(0)
    self.assertEqual(proto.int64_list.value[0], 0)

  def test_convert_to_feature_bytes(self):
    proto = tfrecord_lib.convert_to_feature(b'123')
    self.assertEqual(proto.bytes_list.value[0], b'123')

  def test_convert_to_feature_float_list(self):
    proto = tfrecord_lib.convert_to_feature([0.0, 1.0])
    self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0])

  def test_convert_to_feature_int_list(self):
    proto = tfrecord_lib.convert_to_feature([0, 1])
    self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1])

  def test_convert_to_feature_bytes_list(self):
    proto = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceAlmostEqual(proto.bytes_list.value, [b'123', b'456'])


if __name__ == '__main__':
  tf.test.main()

official/vision/beta/dataloaders/__init__.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

official/vision/beta/dataloaders/classification_input.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional

# Import libraries
import tensorflow as tf

from official.vision.beta.configs import common
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import augment
from official.vision.beta.ops import preprocess_ops

MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)

DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               is_multilabel: bool = False,
               keys_to_features: Optional[Dict[str, Any]] = None):
    if not keys_to_features:
      keys_to_features = {
          image_field_key:
              tf.io.FixedLenFeature((), tf.string, default_value=''),
      }
      if is_multilabel:
        keys_to_features.update(
            {label_field_key: tf.io.VarLenFeature(dtype=tf.int64)})
      else:
        keys_to_features.update({
            label_field_key:
                tf.io.FixedLenFeature((), tf.int64, default_value=-1)
        })
    self._keys_to_features = keys_to_features

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
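
A minimal usage sketch (the TFRecord path is hypothetical); with the default
single-label keys, `decode` returns a dict of two scalar tensors:

raw_dataset = tf.data.TFRecordDataset('/tmp/imagenet/train-00000-of-01024')
decoder_inst = Decoder()
for serialized_example in raw_dataset.take(1):
  decoded = decoder_inst.decode(serialized_example)
  image_bytes = decoded['image/encoded']   # scalar tf.string
  label = decoded['image/class/label']     # scalar tf.int64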


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size: List[int],
               num_classes: float,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               decode_jpeg_only: bool = True,
               aug_rand_hflip: bool = True,
               aug_type: Optional[common.Augmentation] = None,
               color_jitter: float = 0.,
               random_erasing: Optional[common.RandomErasing] = None,
               is_multilabel: bool = False,
               dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride
        2^max_level.
      num_classes: `float`, number of classes.
      image_field_key: `str`, the key name to encoded image in tf.Example.
      label_field_key: `str`, the key name to label in tf.Example.
      decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
        faster than decoding other types. Default is True.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      color_jitter: Magnitude of color jitter. If > 0, the value is used to
        generate random scale factor for brightness, contrast and saturation.
        See `preprocess_ops.color_jitter` for more details.
      random_erasing: if not None, augment input image by random erasing. See
        `augment.RandomErasing` for more details.
      is_multilabel: A `bool`, whether or not each example has multiple
        labels.
      dtype: `str`, cast output image in dtype. It can be 'float32',
        'float16', or 'bfloat16'.
    """
    self._output_size = output_size
    self._aug_rand_hflip = aug_rand_hflip
    self._num_classes = num_classes
    self._image_field_key = image_field_key
    if dtype == 'float32':
      self._dtype = tf.float32
    elif dtype == 'float16':
      self._dtype = tf.float16
    elif dtype == 'bfloat16':
      self._dtype = tf.bfloat16
    else:
      raise ValueError('dtype {!r} is not supported!'.format(dtype))
    if aug_type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError('Augmentation policy {} not supported.'.format(
            aug_type.type))
    else:
      self._augmenter = None
    self._label_field_key = label_field_key
    self._color_jitter = color_jitter
    if random_erasing:
      self._random_erasing = augment.RandomErasing(
          probability=random_erasing.probability,
          min_area=random_erasing.min_area,
          max_area=random_erasing.max_area,
          min_aspect=random_erasing.min_aspect,
          max_aspect=random_erasing.max_aspect,
          min_count=random_erasing.min_count,
          max_count=random_erasing.max_count,
          trials=random_erasing.trials)
    else:
      self._random_erasing = None
    self._is_multilabel = is_multilabel
    self._decode_jpeg_only = decode_jpeg_only

  def _parse_train_data(self, decoded_tensors):
    """Parses data for training."""
    image = self._parse_train_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_eval_data(self, decoded_tensors):
    """Parses data for evaluation."""
    image = self._parse_eval_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes,
                                                      image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Color jitter.
    if self._color_jitter > 0:
      image = preprocess_ops.color_jitter(image, self._color_jitter,
                                          self._color_jitter,
                                          self._color_jitter)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Random erasing after the image has been normalized.
    if self._random_erasing is not None:
      image = self._random_erasing.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  @classmethod
  def inference_fn(cls,
                   image: tf.Tensor,
                   input_image_size: List[int],
                   num_channels: int = 3) -> tf.Tensor:
    """Builds image model inputs for serving."""
    image = tf.cast(image, dtype=tf.float32)
    image = preprocess_ops.center_crop_image(image)
    image = tf.image.resize(
        image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image.set_shape(input_image_size + [num_channels])
    return image
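
A minimal sketch of wiring the decoder and parser together, reusing the
hypothetical `raw_dataset` from the decoder example above; the output size
and class count are illustrative:

parser_inst = Parser(output_size=[224, 224], num_classes=1001)
train_dataset = (raw_dataset
                 .map(Decoder().decode)
                 .map(parser_inst.parse_fn(is_training=True)))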

official/vision/beta/dataloaders/decoder.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc


class Decoder(object):
  """Decodes the raw data into tensors."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def decode(self, serialized_example):
    """Decodes the serialized example into tensors.

    Args:
      serialized_example: a serialized string tensor that encodes the data.

    Returns:
      decoded_tensors: a dict of Tensors.
    """
    pass

official/vision/beta/dataloaders/input_reader.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple

import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader


def calculate_batch_sizes(total_batch_size: int,
                          pseudo_label_ratio: float) -> Tuple[int, int]:
  """Calculates labeled and pseudo-labeled dataset batch sizes.

  Returns (labeled_batch_size, pseudo_labeled_batch_size) given a
  total batch size and pseudo-label data ratio.

  Args:
    total_batch_size: The total batch size for all data.
    pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
      to labeled data in a batch.

  Returns:
    (labeled_batch_size, pseudo_labeled_batch_size) as ints.

  Raises:
    ValueError: If total_batch_size is negative.
    ValueError: If pseudo_label_ratio is negative.
  """
  if total_batch_size < 0:
    raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
  if pseudo_label_ratio < 0.0:
    raise ValueError(
        'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))

  ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
  pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor))
  labeled_batch_size = total_batch_size - pseudo_labeled_batch_size
  return labeled_batch_size, pseudo_labeled_batch_size
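
For example, a total batch size of 256 with a pseudo-label ratio of 1.0 gives
ratio_factor = 0.5 and an even (128, 128) split, while a ratio of 0.25 gives
round(256 * 0.2) = 51 pseudo-labeled and 205 labeled examples per batch:

assert calculate_batch_sizes(256, 1.0) == (128, 128)
assert calculate_batch_sizes(256, 0.25) == (205, 51)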


class CombinationDatasetInputReader(input_reader.InputReader):
  """Combination dataset input reader."""

  def __init__(self,
               params: cfg.DataConfig,
               dataset_fn=tf.data.TFRecordDataset,
               pseudo_label_dataset_fn=tf.data.TFRecordDataset,
               decoder_fn: Optional[Callable[..., Any]] = None,
               sample_fn: Optional[Callable[..., Any]] = None,
               parser_fn: Optional[Callable[..., Any]] = None,
               transform_and_batch_fn: Optional[
                   Callable[[tf.data.Dataset,
                             Optional[tf.distribute.InputContext]],
                            tf.data.Dataset]] = None,
               postprocess_fn: Optional[Callable[..., Any]] = None):
    """Initializes a CombinationDatasetInputReader instance.

    This class mixes a labeled and pseudo-labeled dataset. The params
    must contain "pseudo_label_data.input_path" to specify the
    pseudo-label dataset files and "pseudo_label_data.data_ratio"
    to specify a per-batch mixing ratio of pseudo-label examples to
    labeled dataset examples.

    Args:
      params: A config_definitions.DataConfig object.
      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
        example, it can be `tf.data.TFRecordDataset`.
      pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input
        files. For example, it can be `tf.data.TFRecordDataset`.
      decoder_fn: An optional `callable` that takes the serialized data
        string and decodes them into the raw tensor dictionary.
      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object
        as input and outputs the transformed dataset. It performs sampling on
        the decoded raw tensors dict before the parser_fn.
      parser_fn: An optional `callable` that takes the decoded raw tensors
        dict and parses them into a dictionary of tensors that can be
        consumed by the model. It will be executed after decoder_fn.
      transform_and_batch_fn: An optional `callable` that takes a
        `tf.data.Dataset` object and an optional `tf.distribute.InputContext`
        as input, and returns a `tf.data.Dataset` object. It will be executed
        after `parser_fn` to transform and batch the dataset; if None, after
        `parser_fn` is executed, the dataset will be batched into per-replica
        batch size.
      postprocess_fn: An optional `callable` that processes batched tensors.
        It will be executed after batching.

    Raises:
      ValueError: If drop_remainder is False.
    """
    super().__init__(
        params=params,
        dataset_fn=dataset_fn,
        decoder_fn=decoder_fn,
        sample_fn=sample_fn,
        parser_fn=parser_fn,
        transform_and_batch_fn=transform_and_batch_fn,
        postprocess_fn=postprocess_fn)
    self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
    self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
    self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
    self._pseudo_label_matched_files = input_reader.match_files(
        self._pseudo_label_file_pattern)
    if not self._drop_remainder:
      raise ValueError(
          'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(self,
           input_context: Optional[tf.distribute.InputContext] = None
          ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    if not labeled_batch_size and pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      dataset = self._read_data_source(matched_files, dataset_fn,
                                       input_context, tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size,
                                            input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)

    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)

official/vision/beta/dataloaders/input_reader_factory.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory for getting TF-Vision input readers."""
from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.beta.dataloaders import input_reader as vision_input_reader


def input_reader_generator(params: cfg.DataConfig,
                           **kwargs) -> core_input_reader.InputReader:
  """Instantiates an input reader class according to the params.

  Args:
    params: A config_definitions.DataConfig object.
    **kwargs: Additional arguments passed to input reader initialization.

  Returns:
    An InputReader object.
  """
  if params.is_training and params.get('pseudo_label_data', False):
    return vision_input_reader.CombinationDatasetInputReader(
        params,
        pseudo_label_dataset_fn=dataset_fn_util.pick_dataset_fn(
            params.pseudo_label_data.file_type),
        **kwargs)
  else:
    return core_input_reader.InputReader(params, **kwargs)
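
A minimal sketch of the factory in use; the config values are illustrative
and `my_decoder` is a hypothetical decoder instance:

params = cfg.DataConfig(
    input_path='/tmp/coco/train*.tfrecord',
    is_training=True,
    global_batch_size=64)
# Without a `pseudo_label_data` field, this falls back to the core InputReader.
reader = input_reader_generator(params, decoder_fn=my_decoder.decode)
dataset = reader.read()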

official/vision/beta/dataloaders/maskrcnn_input.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
# Import libraries
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float`, IoU threshold at or above which an anchor
        is labeled positive for RPN training.
      rpn_unmatched_threshold: `float`, IoU threshold below which an anchor is
        labeled negative for RPN training.
      rpn_batch_size_per_im: `int`, number of RPN anchors sampled per image.
      rpn_fg_fraction: `float`, fraction of foreground anchors in the sampled
        RPN batch.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether to parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size

    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size

    # Image output dtype.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location]. The height_l
          and width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location * 4]. The
          height_l and width_l represent the dimension of bounding box
          regression output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is
          represented in [y1, x1, y2, x2] format. The coordinates are w.r.t
          the scaled image that is fed to the network. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and resized
          to a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(self._rpn_match_threshold,
                                             self._rpn_unmatched_threshold,
                                             self._rpn_batch_size_per_im,
                                             self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': anchor_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
        'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1),
        'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return image, labels
def
_parse_eval_data
(
self
,
data
):
"""Parses data for evaluation.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
A dictionary of {'images': image, 'labels': labels} where
image: image tensor that is preproessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following
describes {key: value} pairs in the dictionary.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
image_info: a 2D `Tensor` that encodes the information of the image
and the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each
level.
"""
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)

    # Computes anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))

    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }

    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths

    return image, labels
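The `image_info` tensor packed into the labels above follows the convention used throughout these parsers: row 0 is the original (height, width), row 1 the scaled size, row 2 the (y, x) scale, and row 3 the (y, x) offset. A minimal sketch of how detections in scaled-image pixels map back to original-image coordinates; the tensor values here are made up for illustration:

import tensorflow as tf

# Hypothetical image_info for an 800x600 image resized into the model input.
image_info = tf.constant([[800.0, 600.0],   # original (height, width)
                          [1024.0, 768.0],  # scaled (height, width)
                          [1.28, 1.28],     # (y_scale, x_scale)
                          [0.0, 0.0]])      # (y_offset, x_offset)

# A detection box in scaled-image pixels: [ymin, xmin, ymax, xmax].
box = tf.constant([128.0, 128.0, 512.0, 384.0])

# Undo the offset and scale to recover original-image coordinates, mirroring
# the cropped_boxes arithmetic in _parse_train_data above.
scale = tf.tile(image_info[2, :], [2])
offset = tf.tile(image_info[3, :], [2])
box_original = (box + offset) / scale
print(box_original.numpy())  # [100. 100. 400. 300.]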
official/vision/beta/dataloaders/parser.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic parser interface."""
import abc


class Parser(object):
  """Parses data and produces tensors to be consumed by models."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def _parse_train_data(self, decoded_tensors):
    """Generates images and labels that are usable for model training.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  @abc.abstractmethod
  def _parse_eval_data(self, decoded_tensors):
    """Generates images and labels that are usable for model evaluation.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  def parse_fn(self, is_training):
    """Returns a parse fn that reads and parses raw tensors from the decoder.

    Args:
      is_training: a `bool` to indicate whether it is in training mode.

    Returns:
      parse: a `callable` that takes the serialized example and generates the
        (images, labels) tuple where labels is a dict of Tensors that contains
        labels.
    """

    def parse(decoded_tensors):
      """Parses the serialized example data."""
      if is_training:
        return self._parse_train_data(decoded_tensors)
      else:
        return self._parse_eval_data(decoded_tensors)

    return parse

  @classmethod
  def inference_fn(cls, inputs):
    """Parses inputs for predictions.

    Args:
      inputs: A Tensor, or dictionary of Tensors.

    Returns:
      processed_inputs: An input tensor to the model.
    """
    pass
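To make the interface above concrete, here is a minimal sketch of a subclass plugged into a tf.data pipeline. `ToyParser` and its feature keys are invented for illustration and are not part of the repository; it assumes the `Parser` base class defined above is importable.

import tensorflow as tf

class ToyParser(Parser):
  """Trivial parser that normalizes the image and passes the label through."""

  def _parse_train_data(self, decoded_tensors):
    image = tf.cast(decoded_tensors['image'], tf.float32) / 255.0
    return image, {'labels': decoded_tensors['label']}

  def _parse_eval_data(self, decoded_tensors):
    image = tf.cast(decoded_tensors['image'], tf.float32) / 255.0
    return image, {'labels': decoded_tensors['label']}

# A dataset of already-decoded tensors, mapped through parse_fn.
dataset = tf.data.Dataset.from_tensors(
    {'image': tf.zeros([8, 8, 3], tf.uint8), 'label': tf.constant(1)})
dataset = dataset.map(ToyParser().parse_fn(is_training=True))

The design keeps decoding and parsing separate: the decoder turns serialized records into raw tensors, and the parser owns all model-specific preprocessing and target assignment.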
official/vision/beta/dataloaders/retinanet_input.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
from absl import logging
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import augment
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               dtype='bfloat16',
               mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added on each
level. For instances, num_scales=2 adds one additional intermediate
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect raito
anchors added on each level. The number indicates the ratio of width to
height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
skip_crowd_during_training: `bool`, if True, skip annotations labeled with
`is_crowd` equals to 1.
max_num_instances: `int` number of maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Data augmentation with AutoAugment or RandAugment.
    self._augmenter = None
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment.build_for_detection(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')

    # Deprecated. Data augmentation with AutoAugment.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name

    # Data type.
    self._dtype = dtype
  def _parse_train_data(self, data):
    """Parses data for training."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)
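      # The `tf.cond` above guards against records whose
      # `groundtruth_is_crowd` field is empty: in that case every annotation
      # is kept; otherwise only the non-crowd indices survive the gathers.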

    # Gets original image.
    image = data['image']

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Sets up groundtruth data for evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(data['groundtruth_boxes'],
                                           image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
      groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
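A rough sketch of what `match_threshold` and `unmatched_threshold` mean for the `AnchorLabeler` calls above; the IoU values are made up, and this is not code from the file. Each anchor's best IoU against the ground-truth boxes decides whether it is a positive, a negative, or ignored by the loss.

import tensorflow as tf

# Best IoU of each anchor against any ground-truth box (values made up).
ious = tf.constant([0.9, 0.6, 0.4, 0.1])
match_threshold = 0.5      # at or above: anchor gets class and box targets
unmatched_threshold = 0.4  # below: anchor is labeled background

positive = ious >= match_threshold
negative = ious < unmatched_threshold
ignored = ~positive & ~negative  # in between: excluded from the loss

print(positive.numpy())  # [ True  True False False]
print(negative.numpy())  # [False False False  True]
print(ignored.numpy())   # [False False  True False]

With the parser's defaults, match_threshold = unmatched_threshold = 0.5, the ignored band is empty; setting match_threshold above unmatched_threshold opens it.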
official/vision/beta/dataloaders/segmentation_input.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for segmentation datasets."""
import tensorflow as tf

from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops


class Decoder(decoder.Decoder):
  """A tf.Example decoder for segmentation task."""

  def __init__(self):
    self._keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/height':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
    }

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
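For reference, a small sketch of a serialized record that the `Decoder` above accepts; the tiny PNG payloads are generated on the fly and are purely illustrative, assuming the `Decoder` class above is importable.

import tensorflow as tf

# Encode a 2x2 RGB image and a matching single-channel label map as PNGs.
image_png = tf.io.encode_png(tf.zeros([2, 2, 3], tf.uint8)).numpy()
label_png = tf.io.encode_png(tf.zeros([2, 2, 1], tf.uint8)).numpy()

def _bytes(v):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[v]))

def _int64(v):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))

example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': _bytes(image_png),
    'image/height': _int64(2),
    'image/width': _int64(2),
    'image/segmentation/class/encoded': _bytes(label_png),
}))

# Yields a dict with the four feature keys declared in _keys_to_features.
decoded = Decoder().decode(example.SerializeToString())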
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors.
  """

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
crop_size: `Tensor` or `list` for [height, width] of the crop. If
specified a training crop of size crop_size is returned. This is useful
for cropping original images during training while evaluating on
original image sizes.
resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
resized to output_size.
groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
resize_eval_groundtruth is set to False, the groundtruth masks are
padded to this size.
ignore_label: `int` the pixel with ignore label will not used for training
and evaluation.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
otherwise, the image is resized to output_size.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype
  def _prepare_image_and_label(self, data):
    """Prepares normalized image and label."""
    image = tf.io.decode_image(data['image/encoded'], channels=3)
    label = tf.io.decode_image(data['image/segmentation/class/encoded'],
                               channels=1)
    height = data['image/height']
    width = data['image/width']
    image = tf.reshape(image, (height, width, 3))
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    if not self._preserve_aspect_ratio:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      image = tf.image.resize(image, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return image, label
  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    if self._crop_size:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image and label to the desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(image_mask, self._crop_size + [4])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops mask.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pads the label and makes sure the padded region is assigned the ignore
    # label. The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                 train_image_size, offset)
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)
    return image, labels
  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)

    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size, not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      label = tf.image.pad_to_bounding_box(label, 0, 0,
                                           self._groundtruth_padded_size[0],
                                           self._groundtruth_padded_size[1])
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)
    return image, labels
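One subtlety worth spelling out: `resize_and_crop_masks` and `pad_to_bounding_box` pad with zeros, so the label is shifted by +1 first; after the -1 shift the padded pixels come out as -1 and are remapped to `ignore_label`, keeping real class 0 intact. A toy sketch of the same dance, with plain `tf.pad` standing in for the padding step and made-up values:

import tensorflow as tf

ignore_label = 255
label = tf.constant([[0, 1], [2, 3]])     # a 2x2 label map with classes 0..3

# Offset by +1 so that real class 0 survives the zero padding.
label = label + 1
label = tf.pad(label, [[0, 1], [0, 1]])   # pad to 3x3 with zeros
label = label - 1                         # padded pixels are now -1
label = tf.where(tf.equal(label, -1),
                 ignore_label * tf.ones_like(label), label)
print(label.numpy())
# [[  0   1 255]
#  [  2   3 255]
#  [255 255 255]]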