ModelZoo / ResNet50_tensorflow · Commits

Commit 482823c8, authored Sep 28, 2021 by A. Unique TensorFlower

Merge pull request #10263 from PurdueDualityLab:dataload_pr

PiperOrigin-RevId: 399483092
Parents: 61f8185d 77aa3ea9
Changes: 25. Showing 20 changed files with 1800 additions and 623 deletions (+1800 -623).
official/vision/beta/projects/yolo/configs/backbones.py (+4 -7)
official/vision/beta/projects/yolo/configs/darknet_classification.py (+3 -2)
official/vision/beta/projects/yolo/dataloaders/classification_input.py (+92 -0)
official/vision/beta/projects/yolo/dataloaders/classification_tfds_decoder.py (+0 -34)
official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py (+119 -0)
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input.py (+0 -319)
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input_test.py (+0 -103)
official/vision/beta/projects/yolo/dataloaders/yolo_input.py (+363 -0)
official/vision/beta/projects/yolo/losses/yolo_loss.py (+50 -45)
official/vision/beta/projects/yolo/losses/yolo_loss_test.py (+6 -6)
official/vision/beta/projects/yolo/modeling/backbones/darknet.py (+20 -11)
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py (+13 -1)
official/vision/beta/projects/yolo/modeling/heads/yolo_head.py (+5 -1)
official/vision/beta/projects/yolo/modeling/layers/detection_generator.py (+13 -23)
official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py (+9 -9)
official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py (+191 -25)
official/vision/beta/projects/yolo/modeling/yolo_model.py (+4 -5)
official/vision/beta/projects/yolo/ops/anchor.py (+481 -0)
official/vision/beta/projects/yolo/ops/loss_utils.py (+21 -32)
official/vision/beta/projects/yolo/ops/mosaic.py (+406 -0)
official/vision/beta/projects/yolo/configs/backbones.py (view file @ 482823c8)

@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Backbones configurations."""
 import dataclasses
 from official.modeling import hyperparams
 from official.vision.beta.configs import backbones


 @dataclasses.dataclass
 class Darknet(hyperparams.Config):
-  """Darknet config."""
-  model_id: str = 'darknet53'
+  """DarkNet config."""
+  model_id: str = 'cspdarknet53'
   width_scale: float = 1.0
   depth_scale: float = 1.0
   dilate: bool = False
   min_level: int = 3
   max_level: int = 5
+  use_separable_conv: bool = False
+  use_reorg_input: bool = False


 @dataclasses.dataclass
...
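For context, a minimal sketch of how the updated config defaults could be exercised; the import path follows the file above, but the snippet itself is not part of the diff:

from official.vision.beta.projects.yolo.configs import backbones

config = backbones.Darknet()
print(config.model_id)                    # 'cspdarknet53' after this change (was 'darknet53')
print(config.min_level, config.max_level)  # 3 5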
official/vision/beta/projects/yolo/configs/darknet_classification.py (view file @ 482823c8)

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Image classification with darknet configs."""
 import dataclasses
...
@@ -28,14 +27,16 @@ from official.vision.beta.projects.yolo.configs import backbones
 @dataclasses.dataclass
 class ImageClassificationModel(hyperparams.Config):
   """Image classification model config."""
   num_classes: int = 0
-  input_size: List[int] = dataclasses.field(default_factory=list)
+  input_size: List[int] = dataclasses.field(
+      default_factory=lambda: [224, 224])
   backbone: backbones.Backbone = backbones.Backbone(
       type='darknet', darknet=backbones.Darknet())
   dropout_rate: float = 0.0
   norm_activation: common.NormActivation = common.NormActivation()
   # Adds a Batch Normalization layer pre-GlobalAveragePooling in classification.
   add_head_batch_norm: bool = False
+  kernel_initializer: str = 'VarianceScaling'


 @dataclasses.dataclass
...
official/vision/beta/projects/yolo/dataloaders/classification_input.py — new file (0 → 100755), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
import tensorflow as tf

from official.vision.beta.dataloaders import classification_input
from official.vision.beta.ops import preprocess_ops


class Parser(classification_input.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)
    image = image / 255.0

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)
    image = image / 255.0

    return image
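As a side note, the training path above relies on a crop-with-fallback pattern: when the random crop is a no-op (its output shape equals the input shape), the parser falls back to a center crop. A standalone sketch of the same pattern, with hypothetical crop callables standing in for the preprocess_ops helpers:

import tensorflow as tf

def crop_with_fallback(image, random_crop_fn, center_crop_fn):
  """Randomly crop `image`; fall back to a center crop when the random
  crop returned the image unchanged (same shape as the input)."""
  cropped = random_crop_fn(image)
  return tf.cond(
      tf.reduce_all(tf.equal(tf.shape(cropped), tf.shape(image))),
      lambda: center_crop_fn(image),
      lambda: cropped)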
official/vision/beta/projects/yolo/dataloaders/classification_tfds_decoder.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""TFDS Classification decoder."""
import tensorflow as tf

from official.vision.beta.dataloaders import decoder


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self):
    return

  def decode(self, serialized_example):
    sample_dict = {
        'image/encoded':
            tf.io.encode_jpeg(serialized_example['image'], quality=100),
        'image/class/label':
            serialized_example['label'],
    }
    return sample_dict
official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py — new file (0 → 100644), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Tensorflow Example proto decoder for object detection.

A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf

from official.vision.beta.dataloaders import tf_example_decoder


def _coco91_to_80(classif, box, areas, iscrowds):
  """Function used to reduce COCO 91 to COCO 80 (2017 to 2014 format)."""
  # Vector where index i correlates to the class at index[i].
  class_ids = [
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
      22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
      43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
      62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
      85, 86, 87, 88, 89, 90
  ]
  new_classes = tf.expand_dims(tf.convert_to_tensor(class_ids), axis=0)

  # Reshape the classes in order to build a class mask.
  classes = tf.expand_dims(classif, axis=-1)

  # One hot the classifications to match the 80 class format.
  ind = classes == tf.cast(new_classes, classes.dtype)

  # Select the max values.
  selected_class = tf.reshape(
      tf.math.argmax(tf.cast(ind, tf.float32), axis=-1), [-1])
  ind = tf.where(tf.reduce_any(ind, axis=-1))

  # Gather the valuable instances.
  classif = tf.gather_nd(selected_class, ind)
  box = tf.gather_nd(box, ind)
  areas = tf.gather_nd(areas, ind)
  iscrowds = tf.gather_nd(iscrowds, ind)

  # Restate the number of viable detections; ideally it should be the same.
  num_detections = tf.shape(classif)[0]
  return classif, box, areas, iscrowds, num_detections


class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
  """Tensorflow Example proto decoder."""

  def __init__(self,
               coco91_to_80=None,
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
    """Initialize the example decoder.

    Args:
      coco91_to_80: `bool` indicating whether to convert coco from its 91 class
        format to the 80 class format.
      include_mask: `bool` indicating if the decoder should also decode
        instance masks for instance segmentation.
      regenerate_source_id: `bool` indicating if the source id needs to be
        recreated for each image sample.
      mask_binarize_threshold: `float` for binarizing mask values.
    """
    if coco91_to_80 and include_mask:
      raise ValueError('If masks are included you cannot convert coco from the'
                       '91 class format to the 80 class format.')

    self._coco91_to_80 = coco91_to_80
    super().__init__(
        include_mask=include_mask,
        regenerate_source_id=regenerate_source_id,
        mask_binarize_threshold=mask_binarize_threshold)

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: a int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    decoded_tensors = super().decode(serialized_example)
    if self._coco91_to_80:
      (decoded_tensors['groundtruth_classes'],
       decoded_tensors['groundtruth_boxes'],
       decoded_tensors['groundtruth_area'],
       decoded_tensors['groundtruth_is_crowd'], _) = _coco91_to_80(
           decoded_tensors['groundtruth_classes'],
           decoded_tensors['groundtruth_boxes'],
           decoded_tensors['groundtruth_area'],
           decoded_tensors['groundtruth_is_crowd'])
    return decoded_tensors
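The remapping trick in _coco91_to_80 can be seen in isolation with toy data; the four-entry id table below is illustrative only (the real table has 80 entries):

import tensorflow as tf

class_ids = tf.constant([1, 2, 3, 13])           # toy retained-id table
labels = tf.constant([[2], [12], [13]])          # toy labels, shape [n, 1]
mask = labels == tf.cast(class_ids[tf.newaxis, :], labels.dtype)
new_ids = tf.argmax(tf.cast(mask, tf.float32), axis=-1)  # position in table
keep = tf.where(tf.reduce_any(mask, axis=-1))    # rows with a match: 0 and 2
remapped = tf.gather_nd(new_ids, keep)           # [1, 3]; label 12 is dropped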
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Detection Data parser and processing for YOLO.

Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
from official.vision.beta.projects.yolo.ops import box_ops as yolo_box_ops
from official.vision.beta.projects.yolo.ops import preprocess_ops as yolo_preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               num_classes,
               fixed_size=True,
               jitter_im=0.1,
               jitter_boxes=0.005,
               use_tie_breaker=True,
               min_level=3,
               max_level=5,
               masks=None,
               max_process_size=608,
               min_process_size=320,
               max_num_instances=200,
               random_flip=True,
               aug_rand_saturation=True,
               aug_rand_brightness=True,
               aug_rand_zoom=True,
               aug_rand_hue=True,
               anchors=None,
               seed=10,
               dtype=tf.float32):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: a `Tuple` for (width, height) of input image.
      num_classes: a `Tensor` or `int` for the number of classes.
      fixed_size: a `bool`; if True all output images have the same size.
      jitter_im: a `float` representing a pixel value that is the maximum
        jitter applied to the image for data augmentation during training.
      jitter_boxes: a `float` representing a pixel value that is the maximum
        jitter applied to the bounding box for data augmentation during
        training.
      use_tie_breaker: boolean value for whether or not to use the tie_breaker.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      masks: a `Tensor`, `List` or `numpy.ndarray` for anchor masks.
      max_process_size: an `int` for maximum image width and height.
      min_process_size: an `int` for minimum image width and height.
      max_num_instances: an `int` for the maximum number of instances in an
        image.
      random_flip: a `bool`; if True, augment training with random horizontal
        flip.
      aug_rand_saturation: `bool`; if True, augment training with random
        saturation.
      aug_rand_brightness: `bool`; if True, augment training with random
        brightness.
      aug_rand_zoom: `bool`; if True, augment training with random zoom.
      aug_rand_hue: `bool`; if True, augment training with random hue.
      anchors: a `Tensor`, `List` or `numpy.ndarray` for bounding box priors.
      seed: an `int` for the seed used by tf.random.
      dtype: a `tf.dtypes.DType` object that represents the dtype the outputs
        will be casted to. The available types are tf.float32, tf.float16, or
        tf.bfloat16.
    """
    self._net_down_scale = 2**max_level
    self._num_classes = num_classes
    self._image_w = (output_size[0] // self._net_down_scale
                    ) * self._net_down_scale
    self._image_h = (output_size[1] // self._net_down_scale
                    ) * self._net_down_scale
    self._max_process_size = max_process_size
    self._min_process_size = min_process_size
    self._fixed_size = fixed_size
    self._anchors = anchors
    self._masks = {
        key: tf.convert_to_tensor(value) for key, value in masks.items()
    }
    self._use_tie_breaker = use_tie_breaker
    self._jitter_im = 0.0 if jitter_im is None else jitter_im
    self._jitter_boxes = 0.0 if jitter_boxes is None else jitter_boxes
    self._max_num_instances = max_num_instances
    self._random_flip = random_flip
    self._aug_rand_saturation = aug_rand_saturation
    self._aug_rand_brightness = aug_rand_brightness
    self._aug_rand_zoom = aug_rand_zoom
    self._aug_rand_hue = aug_rand_hue
    self._seed = seed
    self._dtype = dtype

  def _build_grid(self, raw_true, width, batch=False, use_tie_breaker=False):
    mask = self._masks
    for key in self._masks.keys():
      if not batch:
        mask[key] = yolo_preprocess_ops.build_grided_gt(
            raw_true, self._masks[key], width // 2**int(key),
            raw_true['bbox'].dtype, use_tie_breaker)
      else:
        mask[key] = yolo_preprocess_ops.build_batch_grided_gt(
            raw_true, self._masks[key], width // 2**int(key),
            raw_true['bbox'].dtype, use_tie_breaker)
    return mask

  def _parse_train_data(self, data):
    """Generates images and labels that are usable for model training.

    Args:
      data: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    shape = tf.shape(data['image'])
    image = data['image'] / 255
    boxes = data['groundtruth_boxes']
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        image, boxes, width=width, height=height,
        target_dim=self._max_process_size)
    image_shape = tf.shape(image)[:2]

    if self._random_flip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes, seed=self._seed)

    randscale = self._image_w // self._net_down_scale
    if not self._fixed_size:
      do_scale = tf.greater(
          tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
      if do_scale:
        # This scales the image to a random multiple of net_down_scale
        # between 320 and 608.
        randscale = tf.random.uniform(
            [],
            minval=self._min_process_size // self._net_down_scale,
            maxval=self._max_process_size // self._net_down_scale,
            seed=self._seed,
            dtype=tf.int32) * self._net_down_scale

    if self._jitter_boxes != 0.0:
      boxes = box_ops.denormalize_boxes(boxes, image_shape)
      boxes = box_ops.jitter_boxes(boxes, 0.025)
      boxes = box_ops.normalize_boxes(boxes, image_shape)

    # The YOLO loss function uses x-center, y-center format.
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    if self._jitter_im != 0.0:
      image, boxes = yolo_preprocess_ops.random_translate(
          image, boxes, self._jitter_im, seed=self._seed)

    if self._aug_rand_zoom:
      image, boxes = yolo_preprocess_ops.resize_crop_filter(
          image, boxes,
          default_width=self._image_w,
          default_height=self._image_h,
          target_width=randscale,
          target_height=randscale)
    image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False)

    if self._aug_rand_brightness:
      image = tf.image.random_brightness(image=image, max_delta=.1)  # Brightness
    if self._aug_rand_saturation:
      image = tf.image.random_saturation(
          image=image, lower=0.75, upper=1.25)  # Saturation
    if self._aug_rand_hue:
      image = tf.image.random_hue(image=image, max_delta=.3)  # Hue
    image = tf.clip_by_value(image, 0.0, 1.0)

    # Find the best anchor for the ground truth labels to maximize the iou.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)

    # Padding.
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        boxes, self._max_num_instances, 0)
    classes = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_classes'], self._max_num_instances, -1)
    best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
        best_anchors, self._max_num_instances, 0)
    area = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_area'], self._max_num_instances, 0)
    is_crowd = preprocess_ops.clip_or_pad_to_fixed_size(
        tf.cast(data['groundtruth_is_crowd'], tf.int32),
        self._max_num_instances, 0)

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }

    if self._fixed_size:
      grid = self._build_grid(
          labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
      labels.update({'grid_form': grid})
    return image, labels

  def _parse_eval_data(self, data):
    """Generates images and labels that are usable for model evaluation.

    Args:
      data: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    shape = tf.shape(data['image'])
    image = data['image'] / 255
    boxes = data['groundtruth_boxes']
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        image, boxes, width=width, height=height, target_dim=self._image_w)
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    # Find the best anchor for the ground truth labels to maximize the iou.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)
    boxes = yolo_preprocess_ops.pad_max_instances(
        boxes, self._max_num_instances, 0)
    classes = yolo_preprocess_ops.pad_max_instances(
        data['groundtruth_classes'], self._max_num_instances, 0)
    best_anchors = yolo_preprocess_ops.pad_max_instances(
        best_anchors, self._max_num_instances, 0)
    area = yolo_preprocess_ops.pad_max_instances(
        data['groundtruth_area'], self._max_num_instances, 0)
    is_crowd = yolo_preprocess_ops.pad_max_instances(
        tf.cast(data['groundtruth_is_crowd'], tf.int32),
        self._max_num_instances, 0)

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }
    grid = self._build_grid(
        labels, self._image_w, batch=False,
        use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})
    return image, labels

  def _postprocess_fn(self, image, label):
    randscale = self._image_w // self._net_down_scale
    if not self._fixed_size:
      do_scale = tf.greater(
          tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
      if do_scale:
        # This scales the image to a random multiple of net_down_scale
        # between 320 and 608.
        randscale = tf.random.uniform(
            [],
            minval=self._min_process_size // self._net_down_scale,
            maxval=self._max_process_size // self._net_down_scale,
            seed=self._seed,
            dtype=tf.int32) * self._net_down_scale
    width = randscale
    image = tf.image.resize(image, (width, width))
    grid = self._build_grid(
        label, width, batch=True, use_tie_breaker=self._use_tie_breaker)
    label.update({'grid_form': grid})
    return image, label

  def postprocess_fn(self, is_training=True):
    return self._postprocess_fn if not self._fixed_size and is_training else None
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input_test.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Test case for YOLO detection dataloader configuration definition."""
from absl.testing import parameterized
import dataclasses
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader
from official.modeling import hyperparams
from official.vision.beta.dataloaders import tfds_detection_decoders
from official.vision.beta.projects.yolo.dataloaders import yolo_detection_input


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Dummy configuration for parser."""
  output_size: int = (416, 416)
  num_classes: int = 80
  fixed_size: bool = True
  jitter_im: float = 0.1
  jitter_boxes: float = 0.005
  min_process_size: int = 320
  max_process_size: int = 608
  max_num_instances: int = 200
  random_flip: bool = True
  seed: int = 10
  shuffle_buffer_size: int = 10000


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  input_path: str = ''
  tfds_name: str = 'coco/2017'
  tfds_split: str = 'train'
  global_batch_size: int = 10
  is_training: bool = True
  dtype: str = 'float16'
  decoder = None
  parser: Parser = Parser()
  shuffle_buffer_size: int = 10


class YoloDetectionInputTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(('training', True), ('testing', False))
  def test_yolo_input(self, is_training):
    params = DataConfig(is_training=is_training)
    decoder = tfds_detection_decoders.MSCOCODecoder()
    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
               [348.0, 340.0]]
    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}

    parser = yolo_detection_input.Parser(
        output_size=params.parser.output_size,
        num_classes=params.parser.num_classes,
        fixed_size=params.parser.fixed_size,
        jitter_im=params.parser.jitter_im,
        jitter_boxes=params.parser.jitter_boxes,
        min_process_size=params.parser.min_process_size,
        max_process_size=params.parser.max_process_size,
        max_num_instances=params.parser.max_num_instances,
        random_flip=params.parser.random_flip,
        seed=params.parser.seed,
        anchors=anchors,
        masks=masks)
    postprocess_fn = parser.postprocess_fn(is_training=is_training)

    reader = input_reader.InputReader(
        params,
        dataset_fn=tf.data.TFRecordDataset,
        decoder_fn=decoder.decode,
        parser_fn=parser.parse_fn(params.is_training))
    dataset = reader.read(input_context=None).batch(10).take(1)

    if postprocess_fn:
      image, _ = postprocess_fn(
          *tf.data.experimental.get_single_element(dataset))
    else:
      image, _ = tf.data.experimental.get_single_element(dataset)
    print(image.shape)
    self.assertAllEqual(image.shape, (10, 10, 416, 416, 3))
    self.assertTrue(
        tf.reduce_all(tf.math.logical_and(image >= 0, image <= 1)))


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/yolo/dataloaders/yolo_input.py — new file (0 → 100755), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Detection Data parser and processing for YOLO."""
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import box_ops as bbox_ops
from official.vision.beta.ops import preprocess_ops
from official.vision.beta.projects.yolo.ops import anchor
from official.vision.beta.projects.yolo.ops import preprocessing_ops


class Parser(parser.Parser):
  """Parse the dataset in to the YOLO model format."""

  def __init__(self,
               output_size,
               anchors,
               expanded_strides,
               level_limits=None,
               max_num_instances=200,
               area_thresh=0.1,
               aug_rand_hue=1.0,
               aug_rand_saturation=1.0,
               aug_rand_brightness=1.0,
               letter_box=False,
               random_pad=True,
               random_flip=True,
               jitter=0.0,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               aug_rand_translate=0.0,
               aug_rand_perspective=0.0,
               aug_rand_angle=0.0,
               anchor_t=4.0,
               scale_xy=None,
               best_match_only=False,
               darknet=False,
               use_tie_breaker=True,
               dtype='float32',
               seed=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `List` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      anchors: `Dict[List[Union[int, float]]]` of anchor boxes to be used in
        each level.
      expanded_strides: `Dict[int]` for how much the model scales down the
        images at the largest level. For example, if level 3 down-samples the
        image by a factor of 16, we pass along {3: 16} in the expanded strides
        dictionary, indicating that relative to the original image the shapes
        must be reduced by a factor of 16 to compute the loss.
      level_limits: `List` of the box sizes that will be allowed at each FPN
        level, as is done in the FCOS and YOLOX papers for anchor free box
        assignment.
      max_num_instances: `int` for the number of boxes to compute loss on.
      area_thresh: `float` for the minimum area of a box to allow to pass
        through for optimization.
      aug_rand_hue: `float` indicating the maximum scaling value for hue.
        hue will be scaled between 1 - value and 1 + value.
      aug_rand_saturation: `float` indicating the maximum scaling value for
        saturation. saturation will be scaled between 1/value and value.
      aug_rand_brightness: `float` indicating the maximum scaling value for
        brightness. brightness will be scaled between 1/value and value.
      letter_box: `boolean` indicating whether, at the start of the data
        pipeline and regardless of which preprocessing ops are used, the
        aspect ratio of the images should be preserved.
      random_pad: `bool` indicating whether to use padding to apply random
        translation; true for darknet yolo, false for scaled yolo.
      random_flip: `boolean` indicating whether or not to randomly flip the
        image horizontally.
      jitter: `float` for the maximum change in aspect ratio expected in each
        preprocessing step.
      aug_scale_min: `float` indicating the minimum scaling value for image
        scale jitter.
      aug_scale_max: `float` indicating the maximum scaling value for image
        scale jitter.
      aug_rand_translate: `float` ranging from 0 to 1 indicating the maximum
        amount to randomly translate an image.
      aug_rand_perspective: `float` ranging from 0.000 to 0.001 indicating how
        much to perspective warp the image.
      aug_rand_angle: `float` indicating the maximum angle value for angle.
        angle will be changed between 0 and value.
      anchor_t: `float` indicating the threshold over which an anchor will be
        considered for prediction; at zero all the anchors will be used, and
        at 1.0 only the best will be used. For anchor thresholds larger than
        1.0 we stop using the IOU for anchor comparison and resort directly to
        comparing the width and height; this is used for the scaled models.
      scale_xy: dictionary of `float` values indicating how far each pixel can
        see outside of its containment of 1.0. A value of 1.2 indicates there
        is a 20% extended radius around each pixel within which this specific
        pixel can predict a center. The center can range from 0 - value/2 to
        1 + value/2; this value is set in the yolo filter and reused here.
        There should be one value of scale_xy for each level from min_level to
        max_level.
      best_match_only: `boolean` indicating how boxes are selected for
        optimization.
      darknet: `boolean` indicating which data pipeline to use. Setting to
        True swaps the pipeline to output images relative to Yolov4 and older.
      use_tie_breaker: `boolean` indicating whether to use the anchor
        threshold value.
      dtype: `str` indicating the output datatype of the data pipeline,
        selected from {"float32", "float16", "bfloat16"}.
      seed: `int` seed for random number generation.
    """
    for key in anchors:
      # Assert that the width and height are viable.
      assert output_size[1] % expanded_strides[str(key)] == 0
      assert output_size[0] % expanded_strides[str(key)] == 0

    # Set the width and height properly and base init:
    self._image_w = output_size[1]
    self._image_h = output_size[0]
    self._max_num_instances = max_num_instances

    # Image scaling params.
    self._jitter = 0.0 if jitter is None else jitter
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    self._aug_rand_translate = aug_rand_translate
    self._aug_rand_perspective = aug_rand_perspective

    # Image spatial distortion.
    self._random_flip = random_flip
    self._letter_box = letter_box
    self._random_pad = random_pad
    self._aug_rand_angle = aug_rand_angle

    # Color space distortion of the image.
    self._aug_rand_saturation = aug_rand_saturation
    self._aug_rand_brightness = aug_rand_brightness
    self._aug_rand_hue = aug_rand_hue

    # Set the per-level values needed for operation.
    self._darknet = darknet
    self._area_thresh = area_thresh
    self._seed = seed
    self._dtype = dtype

    self._label_builder = anchor.YoloAnchorLabeler(
        anchors=anchors,
        anchor_free_level_limits=level_limits,
        level_strides=expanded_strides,
        center_radius=scale_xy,
        max_num_instances=max_num_instances,
        match_threshold=anchor_t,
        best_matches_only=best_match_only,
        use_tie_breaker=use_tie_breaker,
        darknet=darknet,
        dtype=dtype)

  def _pad_infos_object(self, image):
    """Get a Tensor to pad the info object list."""
    shape_ = tf.shape(image)
    val = tf.stack([
        tf.cast(shape_[:2], tf.float32),
        tf.cast(shape_[:2], tf.float32),
        tf.ones_like(tf.cast(shape_[:2], tf.float32)),
        tf.zeros_like(tf.cast(shape_[:2], tf.float32)),
    ])
    return val

  def _jitter_scale(self, image, shape, letter_box, jitter, random_pad,
                    aug_scale_min, aug_scale_max, translate, angle,
                    perspective):
    """Distort and scale each input image."""
    infos = []
    if (aug_scale_min != 1.0 or aug_scale_max != 1.0):
      crop_only = True
      # jitter gives you only one info object, resize and crop gives you one;
      # if crop only then there can be 1 from jitter and 1 from crop.
      infos.append(self._pad_infos_object(image))
    else:
      crop_only = False
    image, crop_info, _ = preprocessing_ops.resize_and_jitter_image(
        image,
        shape,
        letter_box=letter_box,
        jitter=jitter,
        crop_only=crop_only,
        random_pad=random_pad,
        seed=self._seed,
    )
    infos.extend(crop_info)
    image, _, affine = preprocessing_ops.affine_warp_image(
        image,
        shape,
        scale_min=aug_scale_min,
        scale_max=aug_scale_max,
        translate=translate,
        degrees=angle,
        perspective=perspective,
        random_pad=random_pad,
        seed=self._seed,
    )
    return image, infos, affine

  def _parse_train_data(self, data):
    """Parses data for training."""
    # Initialize the shape constants.
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    if self._random_flip:
      # Randomly flip the image horizontally.
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes, seed=self._seed)

    if not data['is_mosaic']:
      image, infos, affine = self._jitter_scale(
          image, [self._image_h, self._image_w], self._letter_box,
          self._jitter, self._random_pad, self._aug_scale_min,
          self._aug_scale_max, self._aug_rand_translate, self._aug_rand_angle,
          self._aug_rand_perspective)

      # Clip and clean boxes.
      boxes, inds = preprocessing_ops.transform_and_clip_boxes(
          boxes,
          infos,
          affine=affine,
          shuffle_boxes=False,
          area_thresh=self._area_thresh,
          augment=True,
          seed=self._seed)
      classes = tf.gather(classes, inds)
      info = infos[-1]
    else:
      image = tf.image.resize(
          image, (self._image_h, self._image_w), method='nearest')
      output_size = tf.cast([640, 640], tf.float32)
      boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
      inds = bbox_ops.get_non_empty_box_indices(boxes_)
      boxes = tf.gather(boxes, inds)
      classes = tf.gather(classes, inds)
      info = self._pad_infos_object(image)

    # Apply scaling to the hue saturation and brightness of an image.
    image = tf.cast(image, dtype=self._dtype)
    image = image / 255.0
    image = preprocessing_ops.image_rand_hsv(
        image,
        self._aug_rand_hue,
        self._aug_rand_saturation,
        self._aug_rand_brightness,
        seed=self._seed,
        darknet=self._darknet)

    # Cast the image to the selected datatype.
    image, labels = self._build_label(
        image, boxes, classes, info, inds, data, is_training=True)
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    # Get the image shape constants and cast the image to the selected datatype.
    image = tf.cast(data['image'], dtype=self._dtype)
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    image, infos, _ = preprocessing_ops.resize_and_jitter_image(
        image, [self._image_h, self._image_w],
        letter_box=self._letter_box,
        random_pad=False,
        shiftx=0.5,
        shifty=0.5,
        jitter=0.0)

    # Clip and clean boxes.
    image = image / 255.0
    boxes, inds = preprocessing_ops.transform_and_clip_boxes(
        boxes, infos, shuffle_boxes=False, area_thresh=0.0, augment=True)
    classes = tf.gather(classes, inds)
    info = infos[-1]

    image, labels = self._build_label(
        image, boxes, classes, info, inds, data, is_training=False)
    return image, labels

  def set_shape(self, values, pad_axis=0, pad_value=0, inds=None):
    """Calls set shape for all input objects."""
    if inds is not None:
      values = tf.gather(values, inds)
    vshape = values.get_shape().as_list()

    values = preprocessing_ops.pad_max_instances(
        values, self._max_num_instances, pad_axis=pad_axis,
        pad_value=pad_value)

    vshape[pad_axis] = self._max_num_instances
    values.set_shape(vshape)
    return values

  def _build_label(self, image, gt_boxes, gt_classes, info, inds, data,
                   is_training=True):
    """Label construction for both the train and eval data."""
    width = self._image_w
    height = self._image_h

    # Set the image shape.
    imshape = image.get_shape().as_list()
    imshape[-1] = 3
    image.set_shape(imshape)

    labels = dict()
    (labels['inds'], labels['upds'],
     labels['true_conf']) = self._label_builder(gt_boxes, gt_classes, width,
                                                height)

    # Set/fix the boxes shape.
    boxes = self.set_shape(gt_boxes, pad_axis=0, pad_value=0)
    classes = self.set_shape(gt_classes, pad_axis=0, pad_value=-1)

    # Build the dictionary set.
    labels.update({
        'source_id': utils.process_source_id(data['source_id']),
        'bbox': tf.cast(boxes, dtype=self._dtype),
        'classes': tf.cast(classes, dtype=self._dtype),
    })

    # Update the labels dictionary.
    if not is_training:
      # Sets up groundtruth data for evaluation.
      groundtruths = {
          'source_id': labels['source_id'],
          'height': height,
          'width': width,
          'num_detections': tf.shape(gt_boxes)[0],
          'image_info': info,
          'boxes': gt_boxes,
          'classes': gt_classes,
          'areas': tf.gather(data['groundtruth_area'], inds),
          'is_crowds': tf.cast(
              tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
      }
      groundtruths['source_id'] = utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      labels['groundtruths'] = groundtruths
    return image, labels
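A hedged sketch of constructing the new Parser: the anchor boxes are the per-level ones used in yolo_loss_test.py below, while the output size and per-level stride values are illustrative assumptions, not taken from this diff:

parser = Parser(
    output_size=[640, 640],  # must be divisible by each level's stride
    anchors={'3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
             '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
             '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]},
    expanded_strides={'3': 8, '4': 16, '5': 32},  # assumed strides per level
    max_num_instances=200)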
official/vision/beta/projects/yolo/losses/yolo_loss.py (view file @ 482823c8)

@@ -33,7 +33,6 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):

   def __init__(self,
                classes,
-               mask,
                anchors,
                path_stride=1,
                ignore_thresh=0.7,

@@ -52,8 +51,6 @@
     Args:
       classes: `int` for the number of classes
-      mask: `List[int]` for the output level that this specific model output
-        level
       anchors: `List[List[int]]` for the anchor boxes that are used in the model
         at all levels. For anchor free prediction set the anchor list to be the
         same as the image resolution.

@@ -85,11 +82,10 @@
       max_delta: gradient clipping to apply to the box loss.
     """
     self._loss_type = loss_type
-    self._classes = tf.constant(tf.cast(classes, dtype=tf.int32))
-    self._num = tf.cast(len(mask), dtype=tf.int32)
+    self._classes = classes
+    self._num = tf.cast(len(anchors), dtype=tf.int32)
     self._truth_thresh = truth_thresh
     self._ignore_thresh = ignore_thresh
-    self._masks = mask
     self._anchors = anchors
     self._iou_normalizer = iou_normalizer

@@ -111,8 +107,8 @@
         max_delta=self._max_delta)
     self._decode_boxes = functools.partial(
         loss_utils.get_predicted_box, **box_kwargs)
-    self._search_pairs = lambda pred_boxes, pred_classes, boxes, classes, scale, yxyx: (None, None, None, None)  # pylint:disable=line-too-long
-    self._build_per_path_attributes()
+    self._search_pairs = lambda *args: (None, None, None, None)
+    self._build_per_path_attributes()

   def box_loss(self, true_box, pred_box, darknet=False):

@@ -136,10 +132,15 @@
                               scale=None):
     """Search of all groundtruths to associate groundtruths to predictions."""
     boxes = box_ops.yxyx_to_xcycwh(boxes)
+    if scale is not None:
+      boxes = boxes * tf.cast(tf.stop_gradient(scale), boxes.dtype)
+
+    # Search all predictions against ground truths to find matching boxes for
+    # each pixel.
-    _, _, iou_max, _ = self._search_pairs(
-        pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
+    _, _, iou_max, _ = self._search_pairs(pred_boxes, pred_classes, boxes,
+                                          classes)
     if iou_max is None:
       return true_conf, tf.ones_like(true_conf)

@@ -199,9 +200,6 @@
         grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes,
                                         classes, y_pred)

-    # Temporary metrics
-    box_loss = tf.stop_gradient(0.05 * box_loss / self._iou_normalizer)

     # Metric compute is done here to save time and resources.
     sigmoid_conf = tf.stop_gradient(tf.sigmoid(pred_conf))
     iou = tf.stop_gradient(iou)

@@ -222,21 +220,28 @@
     """The actual logic to apply to the raw model for optimization."""
     ...

-  def post_path_aggregation(self, loss, ground_truths, predictions):  # pylint:disable=unused-argument
+  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
+                            ground_truths, predictions):  # pylint:disable=unused-argument
     """This method allows for post processing of a loss value.

     After the loss has been aggregated across all the FPN levels some post
     processing may need to occur to properly scale the loss. The default
-    behavior is to pass the loss through with no alterations.
+    behavior is to pass the loss through with no alterations. Passing the
+    individual losses for each mask will allow for aggregation of loss across
+    paths for some losses.

     Args:
       loss: `tf.float` scalar for the actual loss.
+      box_loss: `tf.float` for the loss on the boxes only.
+      conf_loss: `tf.float` for the loss on the confidences only.
+      class_loss: `tf.float` for the loss on the classes only.
       ground_truths: `Dict` holding all the ground truth tensors.
       predictions: `Dict` holding all the predicted values.

     Returns:
       loss: `tf.float` scalar for the scaled loss.
     """
+    del box_loss, conf_loss, class_loss, ground_truths, predictions
     return loss

   @abc.abstractmethod

@@ -280,7 +285,6 @@ class DarknetLoss(YoloLossBase):
       association.
     """
     self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
         anchors=self._anchors,
         scale_anchors=self._path_stride)

@@ -314,8 +318,7 @@
     anchor_grid = tf.stop_gradient(anchor_grid)

     # Split all the ground truths to use as separate items in loss computation.
-    (true_box, ind_mask, true_class, _, _) = tf.split(
-        y_true, [4, 1, 1, 1, 1], axis=-1)
+    (true_box, ind_mask, true_class) = tf.split(y_true, [4, 1, 1], axis=-1)
     true_conf = tf.squeeze(true_conf, axis=-1)
     true_class = tf.squeeze(true_class, axis=-1)
     grid_mask = true_conf

@@ -432,13 +435,14 @@ class ScaledLoss(YoloLossBase):
       association.
     """
     self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
         anchors=self._anchors,
         scale_anchors=self._path_stride)

     if self._ignore_thresh > 0.0:
       self._search_pairs = loss_utils.PairWiseSearch(
           iou_type=self._loss_type, any_match=False, min_conf=0.25)
+    self._cls_normalizer = self._cls_normalizer * self._classes / 80
     return

@@ -457,8 +461,7 @@
         width, height, batch_size, dtype=tf.float32)

     # Split the y_true list.
-    (true_box, ind_mask, true_class, _, _) = tf.split(
-        y_true, [4, 1, 1, 1, 1], axis=-1)
+    (true_box, ind_mask, true_class) = tf.split(y_true, [4, 1, 1], axis=-1)
     grid_mask = true_conf = tf.squeeze(true_conf, axis=-1)
     true_class = tf.squeeze(true_class, axis=-1)
     num_objs = tf.cast(tf.reduce_sum(ind_mask), dtype=y_pred.dtype)

@@ -469,7 +472,7 @@
     pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)

     # Decode the boxes for loss compute.
-    scale, pred_box, _ = self._decode_boxes(
+    scale, pred_box, pbg = self._decode_boxes(
         fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=False)

     # If the ignore threshold is enabled, search all boxes ignore all

@@ -477,20 +480,24 @@
     # noted ground truth list.
     if self._ignore_thresh != 0.0:
       (_, obj_mask) = self._tiled_global_box_search(
-          pred_box,
+          pbg,
           tf.stop_gradient(tf.sigmoid(pred_class)),
           boxes,
           classes,
           true_conf,
           smoothed=False,
-          scale=scale)
+          scale=None)

     # Scale and shift and select the ground truth boxes
     # and predictions to the prediction domain.
-    offset = tf.cast(
-        tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
-    offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
-    true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
+    if self._box_type == 'anchor_free':
+      true_box = loss_utils.apply_mask(
+          ind_mask, (scale * self._path_stride * true_box))
+    else:
+      offset = tf.cast(
+          tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
+      offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
+      true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
     pred_box = loss_utils.apply_mask(
         ind_mask, tf.gather_nd(pred_box, inds, batch_dims=1))

@@ -523,7 +530,9 @@
         tf.expand_dims(true_conf, axis=-1), pred_conf, from_logits=True)
     if self._ignore_thresh != 0.0:
       bce = loss_utils.apply_mask(obj_mask, bce)
-    conf_loss = tf.reduce_mean(bce)
+      conf_loss = tf.reduce_sum(bce) / tf.reduce_sum(obj_mask)
+    else:
+      conf_loss = tf.reduce_mean(bce)

     # Compute the cross entropy loss for the class maps.
     class_loss = tf.keras.losses.binary_crossentropy(

@@ -547,7 +556,8 @@
     return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
             ind_mask, grid_mask)

-  def post_path_aggregation(self, loss, ground_truths, predictions):
+  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
+                            ground_truths, predictions):
     """This method allows for post processing of a loss value.

     By default the model will have about 3 FPN levels {3, 4, 5}, on

@@ -558,9 +568,11 @@
     Args:
       loss: `tf.float` scalar for the actual loss.
+      box_loss: `tf.float` for the loss on the boxes only.
+      conf_loss: `tf.float` for the loss on the confidences only.
+      class_loss: `tf.float` for the loss on the classes only.
       ground_truths: `Dict` holding all the ground truth tensors.
       predictions: `Dict` holding all the predicted values.

     Returns:
       loss: `tf.float` scalar for the scaled loss.
     """

@@ -568,7 +580,7 @@
     return loss * scale

   def cross_replica_aggregation(self, loss, num_replicas_in_sync):
-    """In the scaled loss, take the sum of the loss across replicas."""
+    """This method is not specific to each loss path, but each loss type."""
     return loss

@@ -579,7 +591,6 @@ class YoloLoss:
                keys,
                classes,
                anchors,
-               masks=None,
                path_strides=None,
                truth_thresholds=None,
                ignore_thresholds=None,

@@ -603,8 +614,6 @@
       anchors: `List[List[int]]` for the anchor boxes that are used in the model
         at all levels. For anchor free prediction set the anchor list to be the
         same as the image resolution.
-      masks: `List[int]` for the output level that this specific model output
-        level
       path_strides: `Dict[int]` for how much to scale this level to get the
         original input shape for each FPN path.
       truth_thresholds: `Dict[float]` for the IOU value over which the loss is

@@ -651,8 +660,7 @@
     for key in keys:
       self._loss_dict[key] = losses[loss_type](
           classes=classes,
-          anchors=anchors,
-          mask=masks[key],
+          anchors=anchors[key],
           truth_thresh=truth_thresholds[key],
           ignore_thresh=ignore_thresholds[key],
           loss_type=loss_types[key],

@@ -667,7 +675,7 @@
           update_on_repeat=update_on_repeat,
           label_smoothing=label_smoothing)

-  def __call__(self, ground_truth, predictions, use_reduced_logs=True):
+  def __call__(self, ground_truth, predictions):
     metric_dict = collections.defaultdict(dict)
     metric_dict['net']['box'] = 0
     metric_dict['net']['class'] = 0

@@ -687,8 +695,10 @@
       # after computing the loss, scale loss as needed for aggregation
       # across FPN levels
-      loss = self._loss_dict[key].post_path_aggregation(
-          loss, ground_truth, predictions)
+      loss = self._loss_dict[key].post_path_aggregation(
+          loss, loss_box, loss_conf, loss_class, ground_truth, predictions)

       # after completing the scaling of the loss on each replica, handle
       # scaling the loss for merging the loss across replicas

@@ -703,11 +713,6 @@
       metric_dict[key]['avg_iou'] = tf.stop_gradient(avg_iou)
       metric_dict[key]['avg_obj'] = tf.stop_gradient(avg_obj)
-      if not use_reduced_logs:
-        metric_dict[key]['conf_loss'] = tf.stop_gradient(loss_conf)
-        metric_dict[key]['box_loss'] = tf.stop_gradient(loss_box)
-        metric_dict[key]['class_loss'] = tf.stop_gradient(loss_class)
       metric_dict['net']['box'] += tf.stop_gradient(loss_box)
       metric_dict['net']['class'] += tf.stop_gradient(loss_class)
       metric_dict['net']['conf'] += tf.stop_gradient(loss_conf)
...
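To illustrate the widened hook, a minimal sketch of a subclass overriding post_path_aggregation with the new six-argument signature; the 0.5 rescale is a made-up example, not from the diff:

class HalvedLoss(ScaledLoss):

  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
                            ground_truths, predictions):
    # The per-component losses are available here for cross-path weighting;
    # this sketch ignores them and just rescales the aggregate.
    del box_loss, conf_loss, class_loss, ground_truths, predictions
    return loss * 0.5  # hypothetical rescaling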
official/vision/beta/projects/yolo/losses/yolo_loss_test.py (view file @ 482823c8)

@@ -42,10 +42,11 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
         '5': [1, 13, 13, 255]
     }
     classes = 80
-    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-               [348.0, 340.0]]
+    anchors = {
+        '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
+        '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
+        '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
+    }
     keys = ['3', '4', '5']
     path_strides = {key: 2**int(key) for key in keys}

@@ -53,7 +54,6 @@
         keys,
         classes,
         anchors,
-        masks=masks,
         path_strides=path_strides,
         truth_thresholds={key: 1.0 for key in keys},
         ignore_thresholds={key: 0.7 for key in keys},

@@ -79,7 +79,7 @@
         '4': [1, 300, 3],
         '5': [1, 300, 3]
     }, tf.int32)
-    truths = inpdict({'3': [1, 300, 8], '4': [1, 300, 8], '5': [1, 300, 8]})
+    truths = inpdict({'3': [1, 300, 6], '4': [1, 300, 6], '5': [1, 300, 6]})
     boxes = tf.ones([1, 300, 4], dtype=tf.float32)
     classes = tf.ones([1, 300], dtype=tf.float32)
...
official/vision/beta/projects/yolo/modeling/backbones/darknet.py
View file @
482823c8
...
...
@@ -383,9 +383,11 @@ class Darknet(tf.keras.Model):
               max_level=5,
               width_scale=1.0,
               depth_scale=1.0,
+              use_reorg_input=False,
               csp_level_mod=(),
               activation=None,
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               dilate=False,
...
...
@@ -412,11 +414,13 @@ class Darknet(tf.keras.Model):
    self._norm_momentum = norm_momentum
    self._norm_epislon = norm_epsilon
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._activation = activation
    self._kernel_regularizer = kernel_regularizer
    self._dilate = dilate
    self._width_scale = width_scale
    self._depth_scale = depth_scale
+   self._use_reorg_input = use_reorg_input

    self._default_dict = {
        'kernel_initializer': self._kernel_initializer,
...
...
@@ -426,6 +430,7 @@ class Darknet(tf.keras.Model):
        'norm_epsilon': self._norm_epislon,
        'use_sync_bn': self._use_sync_bn,
        'activation': self._activation,
+       'use_separable_conv': self._use_separable_conv,
        'dilation_rate': 1,
        'name': None
    }
...
...
@@ -447,6 +452,9 @@ class Darknet(tf.keras.Model):
    return self._splits

  def _build_struct(self, net, inputs):
+   if self._use_reorg_input:
+     inputs = nn_blocks.Reorg()(inputs)
+
    endpoints = collections.OrderedDict()
    stack_outputs = [inputs]
    for i, config in enumerate(net):
...
...
@@ -662,25 +670,26 @@
@factory.register_backbone_builder('darknet')
def build_darknet(
    input_specs: tf.keras.layers.InputSpec,
-   backbone_cfg: hyperparams.Config,
+   backbone_config: hyperparams.Config,
    norm_activation_config: hyperparams.Config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None
) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
  """Builds darknet."""

- backbone_cfg = backbone_cfg.get()
+ backbone_config = backbone_config.get()
  model = Darknet(
-     model_id=backbone_cfg.model_id,
-     min_level=backbone_cfg.min_level,
-     max_level=backbone_cfg.max_level,
+     model_id=backbone_config.model_id,
+     min_level=backbone_config.min_level,
+     max_level=backbone_config.max_level,
      input_specs=input_specs,
-     dilate=backbone_cfg.dilate,
-     width_scale=backbone_cfg.width_scale,
-     depth_scale=backbone_cfg.depth_scale,
+     dilate=backbone_config.dilate,
+     width_scale=backbone_config.width_scale,
+     depth_scale=backbone_config.depth_scale,
+     use_reorg_input=backbone_config.use_reorg_input,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
+     use_separable_conv=backbone_config.use_separable_conv,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)
  model.summary()
  return model
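The new width_scale and depth_scale arguments follow the compound-scaling idea used by the scaled YOLO family: the width multiplier scales filter counts and the depth multiplier scales block repeats. The helpers below are purely illustrative (round_filters and round_repeats are hypothetical names, not functions from darknet.py) and show how such multipliers are typically applied.

    def round_filters(filters: int, width_scale: float) -> int:
      # Scale the channel count of a layer, keeping at least one filter.
      return max(1, int(filters * width_scale))

    def round_repeats(repeats: int, depth_scale: float) -> int:
      # Scale the number of repeated blocks in a stage.
      return max(1, int(round(repeats * depth_scale)))

    print(round_filters(64, 0.5), round_repeats(3, 1.33))  # 32 4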
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py
View file @
482823c8
...
...
@@ -21,7 +21,7 @@ from official.vision.beta.projects.yolo.modeling.layers import nn_blocks
@tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer):

- def call(self, inputs):
+ def call(self, inputs):  # pylint: disable=arguments-differ
    return None, inputs
...
...
@@ -36,6 +36,7 @@ class YoloFPN(tf.keras.layers.Layer):
               activation='leaky',
               fpn_filter_scale=1,
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -52,6 +53,7 @@ class YoloFPN(tf.keras.layers.Layer):
      activation: `str`, the activation function to use, typically leaky or mish.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
...
...
@@ -66,6 +68,7 @@ class YoloFPN(tf.keras.layers.Layer):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -78,6 +81,7 @@ class YoloFPN(tf.keras.layers.Layer):
    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
...
...
@@ -181,6 +185,7 @@ class YoloPAN(tf.keras.layers.Layer):
               csp_stack=False,
               activation='leaky',
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -200,6 +205,7 @@ class YoloPAN(tf.keras.layers.Layer):
      csp_stack: `bool`, CSPize the FPN.
      activation: `str`, the activation function to use, typically leaky or mish.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing
        by zero.
...
...
@@ -220,6 +226,7 @@ class YoloPAN(tf.keras.layers.Layer):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -236,6 +243,7 @@ class YoloPAN(tf.keras.layers.Layer):
    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
...
...
@@ -371,6 +379,7 @@ class YoloDecoder(tf.keras.Model):
               embed_spp=False,
               activation='leaky',
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -397,6 +406,7 @@ class YoloDecoder(tf.keras.Model):
      embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
      activation: `str`, the activation function to use, typically leaky or mish.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
...
...
@@ -415,6 +425,7 @@ class YoloDecoder(tf.keras.Model):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -426,6 +437,7 @@ class YoloDecoder(tf.keras.Model):
        csp_stack=csp_stack,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        fpn_filter_scale=fpn_filter_scale,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon,
...
...
official/vision/beta/projects/yolo/modeling/heads/yolo_head.py
View file @
482823c8
...
...
@@ -34,6 +34,7 @@ class YoloHead(tf.keras.layers.Layer):
               bias_regularizer=None,
               activation=None,
               smart_bias=False,
+              use_separable_conv=False,
               **kwargs):
    """Yolo Prediction Head initialization function.
...
...
@@ -52,7 +53,8 @@ class YoloHead(tf.keras.layers.Layer):
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      activation: `str`, the activation function to use, typically leaky or mish.
-     smart_bias: `bool` whether or not use smart bias.
+     smart_bias: `bool`, whether to use smart bias.
+     use_separable_conv: `bool`, whether to use separable convs.
      **kwargs: keyword arguments to be passed.
    """
...
...
@@ -70,6 +72,7 @@ class YoloHead(tf.keras.layers.Layer):
    self._output_conv = (classes + output_extras + 5) * boxes_per_level
    self._smart_bias = smart_bias
+   self._use_separable_conv = use_separable_conv

    self._base_config = dict(
        activation=activation,
...
...
@@ -85,6 +88,7 @@ class YoloHead(tf.keras.layers.Layer):
        strides=(1, 1),
        padding='same',
        use_bn=False,
+       use_separable_conv=self._use_separable_conv,
        **self._base_config)

  def bias_init(self, scale, inshape, isize=640, no_per_conf=8):
...
...
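The bias_init(scale, inshape, isize=640, no_per_conf=8) signature above suggests a prior-probability bias seeding for the objectness output, in the spirit of the YOLOv5 heuristic of expecting roughly eight objects per 640px image at each stride. The sketch below is only an illustration of that heuristic under those assumptions, not the actual bias_init implementation.

    import math

    def objectness_prior_bias(stride: float, isize: int = 640,
                              objects_per_image: float = 8.0) -> float:
      # Expected objects per grid cell at this output stride.
      p = objects_per_image / (isize / stride) ** 2
      # log(p) approximates the inverse sigmoid for small probabilities,
      # so the head starts out predicting mostly background.
      return math.log(p)

    print(objectness_prior_bias(32.0))  # approx -3.91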
official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
View file @
482823c8
...
...
@@ -26,7 +26,6 @@ class YoloLayer(tf.keras.Model):
  """Yolo layer (detection generator)."""

  def __init__(self,
-              masks,
               anchors,
               classes,
               iou_thresh=0.0,
...
...
@@ -52,8 +51,6 @@ class YoloLayer(tf.keras.Model):
    """Parameters for the loss functions used at each detection head output.

    Args:
-     masks: `List[int]` for the output level that this specific model output
-       level.
      anchors: `List[List[int]]` for the anchor boxes that are used in the
        model.
      classes: `int` for the number of classes.
...
...
@@ -107,7 +104,6 @@ class YoloLayer(tf.keras.Model):
      **kwargs: Additional keyword arguments.
    """
    super().__init__(**kwargs)
-   self._masks = masks
    self._anchors = anchors
    self._thresh = iou_thresh
    self._ignore_thresh = ignore_thresh
...
...
@@ -127,30 +123,24 @@ class YoloLayer(tf.keras.Model):
    self._pre_nms_points = pre_nms_points
    self._label_smoothing = label_smoothing
-   self._keys = list(masks.keys())
+   self._keys = list(anchors.keys())
    self._len_keys = len(self._keys)
    self._box_type = box_type
-   self._path_scale = path_scale or {
-       key: 2**int(key) for key, _ in masks.items()
-   }
+   self._path_scale = path_scale or {key: 2**int(key) for key in self._keys}
    self._nms_type = nms_type
-   self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
+   self._scale_xy = scale_xy or {key: 1.0 for key, _ in anchors.items()}

    self._generator = {}
    self._len_mask = {}
    for key in self._keys:
-     anchors = [self._anchors[mask] for mask in self._masks[key]]
-     self._generator[key] = self.get_generators(anchors,  # pylint: disable=assignment-from-none
-                                                self._path_scale[key], key)
-     self._len_mask[key] = len(self._masks[key])
+     anchors = self._anchors[key]
+     self._generator[key] = loss_utils.GridGenerator(
+         anchors, scale_anchors=self._path_scale[key])
+     self._len_mask[key] = len(anchors)
    return

- def get_generators(self, anchors, path_scale, path_key):
-   anchor_generator = loss_utils.GridGenerator(
-       anchors, scale_anchors=path_scale)
-   return anchor_generator
-
  def parse_prediction_path(self, key, inputs):
    shape_ = tf.shape(inputs)
    shape = inputs.get_shape().as_list()
...
...
@@ -280,18 +270,19 @@ class YoloLayer(tf.keras.Model):
        'num_detections': num_detections,
    }

- @property
- def losses(self):
+ def get_losses(self):
    """Generates a dictionary of losses to apply to each path.

    Done in the detection generator because all parameters are the same
-   across both loss and detection generator
+   across both loss and detection generator.

    Returns:
      Dict[str, tf.Tensor] of losses
    """
    loss = yolo_loss.YoloLoss(
        keys=self._keys,
        classes=self._classes,
        anchors=self._anchors,
-       masks=self._masks,
        path_strides=self._path_scale,
        truth_thresholds=self._truth_thresh,
        ignore_thresholds=self._ignore_thresh,
...
...
@@ -310,7 +301,6 @@ class YoloLayer(tf.keras.Model):
  def get_config(self):
    return {
-       'masks': dict(self._masks),
        'anchors': [list(a) for a in self._anchors],
        'thresh': self._thresh,
        'max_boxes': self._max_boxes,
...
...
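After this change, YoloLayer takes per-level anchor dictionaries directly (no masks argument) and exposes losses through an explicit get_losses() call rather than shadowing the Keras losses property. A usage sketch, with values mirroring the tests below and `dg` imported as in the test file:

    anchors = {
        '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
        '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
        '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
    }
    box_type = {key: 'scaled' for key in anchors}
    layer = dg.YoloLayer(anchors, 80, box_type=box_type, max_boxes=10)
    losses = layer.get_losses()  # one loss entry per FPN level key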
official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
View file @
482823c8
...
...
@@ -13,7 +13,6 @@
# limitations under the License.

"""Tests for yolo detection generator."""

from absl.testing import parameterized
import tensorflow as tf
...
...
@@ -35,14 +34,15 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
        '5': [1, 13, 13, 255]
    }
    classes = 80
-   masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-   anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-              [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-              [348.0, 340.0]]
-   box_type = {key: 'scaled' for key in masks.keys()}
-   layer = dg.YoloLayer(masks, anchors, classes, box_type=box_type,
-                        max_boxes=10)
+   anchors = {
+       '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
+       '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
+       '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
+   }
+   box_type = {key: 'scaled' for key in anchors.keys()}
+   layer = dg.YoloLayer(anchors, classes, box_type=box_type, max_boxes=10)

    inputs = {}
    for key in input_shape:
...
...
official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
View file @
482823c8
(diff collapsed; contents not shown)
official/vision/beta/projects/yolo/modeling/yolo_model.py
View file @
482823c8
...
...
@@ -16,7 +16,6 @@
import tensorflow as tf

# static base Yolo Models that do not require configuration
# similar to a backbone model id.
...
...
@@ -104,7 +103,7 @@ class Yolo(tf.keras.Model):
    self._backbone = backbone
    self._decoder = decoder
    self._head = head
-   self._filter = detection_generator
+   self._detection_generator = detection_generator
    return

  def call(self, inputs, training=False):
...
...
@@ -115,7 +114,7 @@ class Yolo(tf.keras.Model):
      return {"raw_output": raw_predictions}
    else:
      # Post-processing.
-     predictions = self._filter(raw_predictions)
+     predictions = self._detection_generator(raw_predictions)
      predictions.update({"raw_output": raw_predictions})
      return predictions
...
...
@@ -132,8 +131,8 @@ class Yolo(tf.keras.Model):
    return self._head

  @property
- def filter(self):
-   return self._filter
+ def detection_generator(self):
+   return self._detection_generator

  def get_config(self):
    return self._config_dict
...
...
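The rename from _filter to _detection_generator makes the two call paths easier to read: training returns only the raw head outputs, while inference runs them through the detection generator. A behavior sketch, assuming `backbone`, `decoder`, `head`, and `generator` are built elsewhere and that the constructor accepts these keyword names:

    model = Yolo(backbone=backbone, decoder=decoder, head=head,
                 detection_generator=generator)
    raw = model(images, training=True)    # {'raw_output': ...} only
    dets = model(images, training=False)  # post-processed boxes plus 'raw_output'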
official/vision/beta/projects/yolo/ops/anchor.py
0 → 100644
View file @
482823c8
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Yolo Anchor labler."""
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.beta.projects.yolo.ops
import
box_ops
from
official.vision.beta.projects.yolo.ops
import
loss_utils
from
official.vision.beta.projects.yolo.ops
import
preprocessing_ops
INF
=
10000000
def
get_best_anchor
(
y_true
,
anchors
,
stride
,
width
=
1
,
height
=
1
,
iou_thresh
=
0.25
,
best_match_only
=
False
,
use_tie_breaker
=
True
):
"""Get the correct anchor that is assoiciated with each box using IOU.
Args:
y_true: tf.Tensor[] for the list of bounding boxes in the yolo format.
anchors: list or tensor for the anchor boxes to be used in prediction found
via Kmeans.
stride: `int` stride for the anchors.
width: int for the image width.
height: int for the image height.
iou_thresh: `float` the minimum iou threshold to use for selecting boxes for
each level.
best_match_only: `bool` if the box only has one match and it is less than
the iou threshold, when set to True, this match will be dropped as no
anchors can be linked to it.
use_tie_breaker: `bool` if there is many anchors for a given box, then
attempt to use all of them, if False, only the first matching box will be
used.
Returns:
tf.Tensor: y_true with the anchor associated with each ground truth box
known
"""
with
tf
.
name_scope
(
'get_best_anchor'
):
width
=
tf
.
cast
(
width
,
dtype
=
tf
.
float32
)
height
=
tf
.
cast
(
height
,
dtype
=
tf
.
float32
)
scaler
=
tf
.
convert_to_tensor
([
width
,
height
])
# scale to levels houts width and height
true_wh
=
tf
.
cast
(
y_true
[...,
2
:
4
],
dtype
=
tf
.
float32
)
*
scaler
# scale down from large anchor to small anchor type
anchors
=
tf
.
cast
(
anchors
,
dtype
=
tf
.
float32
)
/
stride
k
=
tf
.
shape
(
anchors
)[
0
]
anchors
=
tf
.
concat
([
tf
.
zeros_like
(
anchors
),
anchors
],
axis
=-
1
)
truth_comp
=
tf
.
concat
([
tf
.
zeros_like
(
true_wh
),
true_wh
],
axis
=-
1
)
if
iou_thresh
>=
1.0
:
anchors
=
tf
.
expand_dims
(
anchors
,
axis
=-
2
)
truth_comp
=
tf
.
expand_dims
(
truth_comp
,
axis
=-
3
)
aspect
=
truth_comp
[...,
2
:
4
]
/
anchors
[...,
2
:
4
]
aspect
=
tf
.
where
(
tf
.
math
.
is_nan
(
aspect
),
tf
.
zeros_like
(
aspect
),
aspect
)
aspect
=
tf
.
maximum
(
aspect
,
1
/
aspect
)
aspect
=
tf
.
where
(
tf
.
math
.
is_nan
(
aspect
),
tf
.
zeros_like
(
aspect
),
aspect
)
aspect
=
tf
.
reduce_max
(
aspect
,
axis
=-
1
)
values
,
indexes
=
tf
.
math
.
top_k
(
tf
.
transpose
(
-
aspect
,
perm
=
[
1
,
0
]),
k
=
tf
.
cast
(
k
,
dtype
=
tf
.
int32
),
sorted
=
True
)
values
=
-
values
ind_mask
=
tf
.
cast
(
values
<
iou_thresh
,
dtype
=
indexes
.
dtype
)
else
:
truth_comp
=
box_ops
.
xcycwh_to_yxyx
(
truth_comp
)
anchors
=
box_ops
.
xcycwh_to_yxyx
(
anchors
)
iou_raw
=
box_ops
.
aggregated_comparitive_iou
(
truth_comp
,
anchors
,
iou_type
=
3
,
)
values
,
indexes
=
tf
.
math
.
top_k
(
iou_raw
,
k
=
tf
.
cast
(
k
,
dtype
=
tf
.
int32
),
sorted
=
True
)
ind_mask
=
tf
.
cast
(
values
>=
iou_thresh
,
dtype
=
indexes
.
dtype
)
# pad the indexs such that all values less than the thresh are -1
# add one, multiply the mask to zeros all the bad locations
# subtract 1 makeing all the bad locations 0.
if
best_match_only
:
iou_index
=
((
indexes
[...,
0
:]
+
1
)
*
ind_mask
[...,
0
:])
-
1
elif
use_tie_breaker
:
iou_index
=
tf
.
concat
([
tf
.
expand_dims
(
indexes
[...,
0
],
axis
=-
1
),
((
indexes
[...,
1
:]
+
1
)
*
ind_mask
[...,
1
:])
-
1
],
axis
=-
1
)
else
:
iou_index
=
tf
.
concat
([
tf
.
expand_dims
(
indexes
[...,
0
],
axis
=-
1
),
tf
.
zeros_like
(
indexes
[...,
1
:])
-
1
],
axis
=-
1
)
return
tf
.
cast
(
iou_index
,
dtype
=
tf
.
float32
),
tf
.
cast
(
values
,
dtype
=
tf
.
float32
)
class
YoloAnchorLabeler
:
"""Anchor labeler for the Yolo Models."""
def
__init__
(
self
,
anchors
=
None
,
anchor_free_level_limits
=
None
,
level_strides
=
None
,
center_radius
=
None
,
max_num_instances
=
200
,
match_threshold
=
0.25
,
best_matches_only
=
False
,
use_tie_breaker
=
True
,
darknet
=
False
,
dtype
=
'float32'
):
"""Initialization for anchor labler.
Args:
anchors: `Dict[List[Union[int, float]]]` values for each anchor box.
anchor_free_level_limits: `List` the box sizes that will be allowed at
each FPN level as is done in the FCOS and YOLOX paper for anchor free
box assignment.
level_strides: `Dict[int]` for how much the model scales down the images
at the each level.
center_radius: `Dict[float]` for radius around each box center to search
for extra centers in each level.
max_num_instances: `int` for the number of boxes to compute loss on.
match_threshold: `float` indicating the threshold over which an anchor
will be considered for prediction, at zero, all the anchors will be used
and at 1.0 only the best will be used. for anchor thresholds larger than
1.0 we stop using the IOU for anchor comparison and resort directly to
comparing the width and height, this is used for the scaled models.
best_matches_only: `boolean` indicating how boxes are selected for
optimization.
use_tie_breaker: `boolean` indicating whether to use the anchor threshold
value.
darknet: `boolean` indicating which data pipeline to use. Setting to True
swaps the pipeline to output images realtive to Yolov4 and older.
dtype: `str` indicating the output datatype of the datapipeline selecting
from {"float32", "float16", "bfloat16"}.
"""
self
.
anchors
=
anchors
self
.
masks
=
self
.
_get_mask
()
self
.
anchor_free_level_limits
=
self
.
_get_level_limits
(
anchor_free_level_limits
)
if
darknet
and
self
.
anchor_free_level_limits
is
None
:
center_radius
=
None
self
.
keys
=
self
.
anchors
.
keys
()
if
self
.
anchor_free_level_limits
is
not
None
:
maxim
=
2000
match_threshold
=
-
0.01
self
.
num_instances
=
{
key
:
maxim
for
key
in
self
.
keys
}
elif
not
darknet
:
self
.
num_instances
=
{
key
:
(
6
-
i
)
*
max_num_instances
for
i
,
key
in
enumerate
(
self
.
keys
)
}
else
:
self
.
num_instances
=
{
key
:
max_num_instances
for
key
in
self
.
keys
}
self
.
center_radius
=
center_radius
self
.
level_strides
=
level_strides
self
.
match_threshold
=
match_threshold
self
.
best_matches_only
=
best_matches_only
self
.
use_tie_breaker
=
use_tie_breaker
self
.
dtype
=
dtype
def
_get_mask
(
self
):
"""For each level get indexs of each anchor for box search across levels."""
masks
=
{}
start
=
0
minimum
=
int
(
min
(
self
.
anchors
.
keys
()))
maximum
=
int
(
max
(
self
.
anchors
.
keys
()))
for
i
in
range
(
minimum
,
maximum
+
1
):
per_scale
=
len
(
self
.
anchors
[
str
(
i
)])
masks
[
str
(
i
)]
=
list
(
range
(
start
,
per_scale
+
start
))
start
+=
per_scale
return
masks
def
_get_level_limits
(
self
,
level_limits
):
"""For each level receptive feild range for anchor free box placement."""
if
level_limits
is
not
None
:
level_limits_dict
=
{}
level_limits
=
[
0.0
]
+
level_limits
+
[
np
.
inf
]
for
i
,
key
in
enumerate
(
self
.
anchors
.
keys
()):
level_limits_dict
[
key
]
=
level_limits
[
i
:
i
+
2
]
else
:
level_limits_dict
=
None
return
level_limits_dict
def
_tie_breaking_search
(
self
,
anchors
,
mask
,
boxes
,
classes
):
"""After search, link each anchor ind to the correct map in ground truth."""
mask
=
tf
.
cast
(
tf
.
reshape
(
mask
,
[
1
,
1
,
1
,
-
1
]),
anchors
.
dtype
)
anchors
=
tf
.
expand_dims
(
anchors
,
axis
=-
1
)
viable
=
tf
.
where
(
tf
.
squeeze
(
anchors
==
mask
,
axis
=
0
))
gather_id
,
_
,
anchor_id
=
tf
.
split
(
viable
,
3
,
axis
=-
1
)
boxes
=
tf
.
gather_nd
(
boxes
,
gather_id
)
classes
=
tf
.
gather_nd
(
classes
,
gather_id
)
classes
=
tf
.
expand_dims
(
classes
,
axis
=-
1
)
classes
=
tf
.
cast
(
classes
,
boxes
.
dtype
)
anchor_id
=
tf
.
cast
(
anchor_id
,
boxes
.
dtype
)
return
boxes
,
classes
,
anchor_id
def
_get_anchor_id
(
self
,
key
,
boxes
,
classes
,
width
,
height
,
stride
,
iou_index
=
None
):
"""Find the object anchor assignments in an anchor based paradigm."""
# find the best anchor
anchors
=
self
.
anchors
[
key
]
num_anchors
=
len
(
anchors
)
if
self
.
best_matches_only
:
# get the best anchor for each box
iou_index
,
_
=
get_best_anchor
(
boxes
,
anchors
,
stride
,
width
=
width
,
height
=
height
,
best_match_only
=
True
,
iou_thresh
=
self
.
match_threshold
)
mask
=
range
(
num_anchors
)
else
:
# search is done across FPN levels, get the mask of anchor indexes
# corralated to this level.
mask
=
self
.
masks
[
key
]
# search for the correct box to use
(
boxes
,
classes
,
anchors
)
=
self
.
_tie_breaking_search
(
iou_index
,
mask
,
boxes
,
classes
)
return
boxes
,
classes
,
anchors
,
num_anchors
def
_get_centers
(
self
,
boxes
,
classes
,
anchors
,
width
,
height
,
scale_xy
):
"""Find the object center assignments in an anchor based paradigm."""
offset
=
tf
.
cast
(
0.5
*
(
scale_xy
-
1
),
boxes
.
dtype
)
grid_xy
,
_
=
tf
.
split
(
boxes
,
2
,
axis
=-
1
)
wh_scale
=
tf
.
cast
(
tf
.
convert_to_tensor
([
width
,
height
]),
boxes
.
dtype
)
grid_xy
=
grid_xy
*
wh_scale
centers
=
tf
.
math
.
floor
(
grid_xy
)
if
offset
!=
0.0
:
clamp
=
lambda
x
,
ma
:
tf
.
maximum
(
# pylint:disable=g-long-lambda
tf
.
minimum
(
x
,
tf
.
cast
(
ma
,
x
.
dtype
)),
tf
.
zeros_like
(
x
))
grid_xy_index
=
grid_xy
-
centers
positive_shift
=
((
grid_xy_index
<
offset
)
&
(
grid_xy
>
1.
))
negative_shift
=
((
grid_xy_index
>
(
1
-
offset
))
&
(
grid_xy
<
(
wh_scale
-
1.
)))
zero
,
_
=
tf
.
split
(
tf
.
ones_like
(
positive_shift
),
2
,
axis
=-
1
)
shift_mask
=
tf
.
concat
([
zero
,
positive_shift
,
negative_shift
],
axis
=-
1
)
offset
=
tf
.
cast
([[
0
,
0
],
[
1
,
0
],
[
0
,
1
],
[
-
1
,
0
],
[
0
,
-
1
]],
offset
.
dtype
)
*
offset
num_shifts
=
tf
.
shape
(
shift_mask
)
num_shifts
=
num_shifts
[
-
1
]
boxes
=
tf
.
tile
(
tf
.
expand_dims
(
boxes
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
classes
=
tf
.
tile
(
tf
.
expand_dims
(
classes
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
anchors
=
tf
.
tile
(
tf
.
expand_dims
(
anchors
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
shift_mask
=
tf
.
cast
(
shift_mask
,
boxes
.
dtype
)
shift_ind
=
shift_mask
*
tf
.
range
(
0
,
num_shifts
,
dtype
=
boxes
.
dtype
)
shift_ind
=
shift_ind
-
(
1
-
shift_mask
)
shift_ind
=
tf
.
expand_dims
(
shift_ind
,
axis
=-
1
)
boxes_and_centers
=
tf
.
concat
([
boxes
,
classes
,
anchors
,
shift_ind
],
axis
=-
1
)
boxes_and_centers
=
tf
.
reshape
(
boxes_and_centers
,
[
-
1
,
7
])
_
,
center_ids
=
tf
.
split
(
boxes_and_centers
,
[
6
,
1
],
axis
=-
1
)
select
=
tf
.
where
(
center_ids
>=
0
)
select
,
_
=
tf
.
split
(
select
,
2
,
axis
=-
1
)
boxes_and_centers
=
tf
.
gather_nd
(
boxes_and_centers
,
select
)
center_ids
=
tf
.
gather_nd
(
center_ids
,
select
)
center_ids
=
tf
.
cast
(
center_ids
,
tf
.
int32
)
shifts
=
tf
.
gather_nd
(
offset
,
center_ids
)
boxes
,
classes
,
anchors
,
_
=
tf
.
split
(
boxes_and_centers
,
[
4
,
1
,
1
,
1
],
axis
=-
1
)
grid_xy
,
_
=
tf
.
split
(
boxes
,
2
,
axis
=-
1
)
centers
=
tf
.
math
.
floor
(
grid_xy
*
wh_scale
-
shifts
)
centers
=
clamp
(
centers
,
wh_scale
-
1
)
x
,
y
=
tf
.
split
(
centers
,
2
,
axis
=-
1
)
centers
=
tf
.
cast
(
tf
.
concat
([
y
,
x
,
anchors
],
axis
=-
1
),
tf
.
int32
)
return
boxes
,
classes
,
centers
def
_get_anchor_free
(
self
,
key
,
boxes
,
classes
,
height
,
width
,
stride
,
center_radius
):
"""Find the box assignements in an anchor free paradigm."""
level_limits
=
self
.
anchor_free_level_limits
[
key
]
gen
=
loss_utils
.
GridGenerator
(
anchors
=
[[
1
,
1
]],
scale_anchors
=
stride
)
grid_points
=
gen
(
width
,
height
,
1
,
boxes
.
dtype
)[
0
]
grid_points
=
tf
.
squeeze
(
grid_points
,
axis
=
0
)
box_list
=
boxes
class_list
=
classes
grid_points
=
(
grid_points
+
0.5
)
*
stride
x_centers
,
y_centers
=
grid_points
[...,
0
],
grid_points
[...,
1
]
boxes
*=
(
tf
.
convert_to_tensor
([
width
,
height
,
width
,
height
])
*
stride
)
tlbr_boxes
=
box_ops
.
xcycwh_to_yxyx
(
boxes
)
boxes
=
tf
.
reshape
(
boxes
,
[
1
,
1
,
-
1
,
4
])
tlbr_boxes
=
tf
.
reshape
(
tlbr_boxes
,
[
1
,
1
,
-
1
,
4
])
if
self
.
use_tie_breaker
:
area
=
tf
.
reduce_prod
(
boxes
[...,
2
:],
axis
=-
1
)
# check if the box is in the receptive feild of the this fpn level
b_t
=
y_centers
-
tlbr_boxes
[...,
0
]
b_l
=
x_centers
-
tlbr_boxes
[...,
1
]
b_b
=
tlbr_boxes
[...,
2
]
-
y_centers
b_r
=
tlbr_boxes
[...,
3
]
-
x_centers
box_delta
=
tf
.
stack
([
b_t
,
b_l
,
b_b
,
b_r
],
axis
=-
1
)
if
level_limits
is
not
None
:
max_reg_targets_per_im
=
tf
.
reduce_max
(
box_delta
,
axis
=-
1
)
gt_min
=
max_reg_targets_per_im
>=
level_limits
[
0
]
gt_max
=
max_reg_targets_per_im
<=
level_limits
[
1
]
is_in_boxes
=
tf
.
logical_and
(
gt_min
,
gt_max
)
else
:
is_in_boxes
=
tf
.
reduce_min
(
box_delta
,
axis
=-
1
)
>
0.0
is_in_boxes_all
=
tf
.
reduce_any
(
is_in_boxes
,
axis
=
(
0
,
1
),
keepdims
=
True
)
# check if the center is in the receptive feild of the this fpn level
c_t
=
y_centers
-
(
boxes
[...,
1
]
-
center_radius
*
stride
)
c_l
=
x_centers
-
(
boxes
[...,
0
]
-
center_radius
*
stride
)
c_b
=
(
boxes
[...,
1
]
+
center_radius
*
stride
)
-
y_centers
c_r
=
(
boxes
[...,
0
]
+
center_radius
*
stride
)
-
x_centers
centers_delta
=
tf
.
stack
([
c_t
,
c_l
,
c_b
,
c_r
],
axis
=-
1
)
is_in_centers
=
tf
.
reduce_min
(
centers_delta
,
axis
=-
1
)
>
0.0
is_in_centers_all
=
tf
.
reduce_any
(
is_in_centers
,
axis
=
(
0
,
1
),
keepdims
=
True
)
# colate all masks to get the final locations
is_in_index
=
tf
.
logical_or
(
is_in_boxes_all
,
is_in_centers_all
)
is_in_boxes_and_center
=
tf
.
logical_and
(
is_in_boxes
,
is_in_centers
)
is_in_boxes_and_center
=
tf
.
logical_and
(
is_in_index
,
is_in_boxes_and_center
)
if
self
.
use_tie_breaker
:
boxes_all
=
tf
.
cast
(
is_in_boxes_and_center
,
area
.
dtype
)
boxes_all
=
((
boxes_all
*
area
)
+
((
1
-
boxes_all
)
*
INF
))
boxes_min
=
tf
.
reduce_min
(
boxes_all
,
axis
=-
1
,
keepdims
=
True
)
boxes_min
=
tf
.
where
(
boxes_min
==
INF
,
-
1.0
,
boxes_min
)
is_in_boxes_and_center
=
boxes_all
==
boxes_min
# construct the index update grid
reps
=
tf
.
reduce_sum
(
tf
.
cast
(
is_in_boxes_and_center
,
tf
.
int16
),
axis
=-
1
)
indexes
=
tf
.
cast
(
tf
.
where
(
is_in_boxes_and_center
),
tf
.
int32
)
y
,
x
,
t
=
tf
.
split
(
indexes
,
3
,
axis
=-
1
)
boxes
=
tf
.
gather_nd
(
box_list
,
t
)
classes
=
tf
.
cast
(
tf
.
gather_nd
(
class_list
,
t
),
boxes
.
dtype
)
reps
=
tf
.
gather_nd
(
reps
,
tf
.
concat
([
y
,
x
],
axis
=-
1
))
reps
=
tf
.
cast
(
tf
.
expand_dims
(
reps
,
axis
=-
1
),
boxes
.
dtype
)
classes
=
tf
.
cast
(
tf
.
expand_dims
(
classes
,
axis
=-
1
),
boxes
.
dtype
)
conf
=
tf
.
ones_like
(
classes
)
# return the samples and the indexes
samples
=
tf
.
concat
([
boxes
,
conf
,
classes
],
axis
=-
1
)
indexes
=
tf
.
concat
([
y
,
x
,
tf
.
zeros_like
(
t
)],
axis
=-
1
)
return
indexes
,
samples
def
build_label_per_path
(
self
,
key
,
boxes
,
classes
,
width
,
height
,
iou_index
=
None
):
"""Builds the labels for one path."""
stride
=
self
.
level_strides
[
key
]
scale_xy
=
self
.
center_radius
[
key
]
if
self
.
center_radius
is
not
None
else
1
width
=
tf
.
cast
(
width
//
stride
,
boxes
.
dtype
)
height
=
tf
.
cast
(
height
//
stride
,
boxes
.
dtype
)
if
self
.
anchor_free_level_limits
is
None
:
(
boxes
,
classes
,
anchors
,
num_anchors
)
=
self
.
_get_anchor_id
(
key
,
boxes
,
classes
,
width
,
height
,
stride
,
iou_index
=
iou_index
)
boxes
,
classes
,
centers
=
self
.
_get_centers
(
boxes
,
classes
,
anchors
,
width
,
height
,
scale_xy
)
ind_mask
=
tf
.
ones_like
(
classes
)
updates
=
tf
.
concat
([
boxes
,
ind_mask
,
classes
],
axis
=-
1
)
else
:
num_anchors
=
1
(
centers
,
updates
)
=
self
.
_get_anchor_free
(
key
,
boxes
,
classes
,
height
,
width
,
stride
,
scale_xy
)
boxes
,
ind_mask
,
classes
=
tf
.
split
(
updates
,
[
4
,
1
,
1
],
axis
=-
1
)
width
=
tf
.
cast
(
width
,
tf
.
int32
)
height
=
tf
.
cast
(
height
,
tf
.
int32
)
full
=
tf
.
zeros
([
height
,
width
,
num_anchors
,
1
],
dtype
=
classes
.
dtype
)
full
=
tf
.
tensor_scatter_nd_add
(
full
,
centers
,
ind_mask
)
num_instances
=
int
(
self
.
num_instances
[
key
])
centers
=
preprocessing_ops
.
pad_max_instances
(
centers
,
num_instances
,
pad_value
=
0
,
pad_axis
=
0
)
updates
=
preprocessing_ops
.
pad_max_instances
(
updates
,
num_instances
,
pad_value
=
0
,
pad_axis
=
0
)
updates
=
tf
.
cast
(
updates
,
self
.
dtype
)
full
=
tf
.
cast
(
full
,
self
.
dtype
)
return
centers
,
updates
,
full
def
__call__
(
self
,
boxes
,
classes
,
width
,
height
):
"""Builds the labels for a single image, not functional in batch mode.
Args:
boxes: `Tensor` of shape [None, 4] indicating the object locations in an
image.
classes: `Tensor` of shape [None] indicating the each objects classes.
width: `int` for the images width.
height: `int` for the images height.
Returns:
centers: `Tensor` of shape [None, 3] of indexes in the final grid where
boxes are located.
updates: `Tensor` of shape [None, 8] the value to place in the final grid.
full: `Tensor` of [width/stride, height/stride, num_anchors, 1] holding
a mask of where boxes are locates for confidence losses.
"""
indexes
=
{}
updates
=
{}
true_grids
=
{}
iou_index
=
None
boxes
=
box_ops
.
yxyx_to_xcycwh
(
boxes
)
if
not
self
.
best_matches_only
and
self
.
anchor_free_level_limits
is
None
:
# stitch and search boxes across fpn levels
anchorsvec
=
[]
for
stitch
in
self
.
anchors
:
anchorsvec
.
extend
(
self
.
anchors
[
stitch
])
stride
=
tf
.
cast
([
width
,
height
],
boxes
.
dtype
)
# get the best anchor for each box
iou_index
,
_
=
get_best_anchor
(
boxes
,
anchorsvec
,
stride
,
width
=
1.0
,
height
=
1.0
,
best_match_only
=
False
,
use_tie_breaker
=
self
.
use_tie_breaker
,
iou_thresh
=
self
.
match_threshold
)
for
key
in
self
.
keys
:
indexes
[
key
],
updates
[
key
],
true_grids
[
key
]
=
self
.
build_label_per_path
(
key
,
boxes
,
classes
,
width
,
height
,
iou_index
=
iou_index
)
return
indexes
,
updates
,
true_grids
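A minimal usage sketch of the labeler above, assuming three FPN levels; the anchors and strides are illustrative values, and boxes are normalized yxyx as expected by __call__:

    import tensorflow as tf

    labeler = YoloAnchorLabeler(
        anchors={'3': [[12, 19], [31, 46], [96, 54]],
                 '4': [[46, 114], [133, 127], [79, 225]],
                 '5': [[301, 150], [172, 286], [348, 340]]},
        level_strides={'3': 8, '4': 16, '5': 32},
        max_num_instances=200)

    boxes = tf.constant([[0.1, 0.1, 0.4, 0.5]], tf.float32)  # [ymin, xmin, ymax, xmax]
    classes = tf.constant([3.0], tf.float32)
    indexes, updates, true_grids = labeler(boxes, classes, width=640, height=640)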
official/vision/beta/projects/yolo/ops/loss_utils.py
View file @
482823c8
...
...
@@ -13,6 +13,7 @@
# limitations under the License.

"""Yolo loss utility functions."""

+import numpy as np
import tensorflow as tf
...
...
@@ -129,6 +130,10 @@ def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
  indexes = apply_mask(tf.cast(ind_mask, indexes.dtype), indexes)
  indexes = (indexes + (ind_mask - 1))

+ # mask truths
+ truths = apply_mask(tf.cast(ind_mask, truths.dtype), truths)
+ truths = (truths + (tf.cast(ind_mask, truths.dtype) - 1))
+
  # reshape the indexes into the correct shape for the loss,
  # just flatten all indexes but the last
  indexes = tf.reshape(indexes, [-1, 4])
...
...
@@ -157,26 +162,16 @@
class GridGenerator:
  """Grid generator that generates anchor grids for box decoding."""

- def __init__(self, anchors, masks=None, scale_anchors=None):
+ def __init__(self, anchors, scale_anchors=None):
    """Initialize Grid Generator.

    Args:
      anchors: A `List[List[int]]` for the anchor boxes that are used in the
        model at all levels.
-     masks: A `List[int]` for the output level that this specific model output
-       level.
      scale_anchors: An `int` for how much to scale this level to get the
        original input shape.
    """
    self.dtype = tf.keras.backend.floatx()
-   if masks is not None:
-     self._num = len(masks)
-   else:
-     self._num = tf.shape(anchors)[0]
-
-   if masks is not None:
-     anchors = [anchors[mask] for mask in masks]
+   self._num = tf.shape(anchors)[0]

    self._scale_anchors = scale_anchors
    self._anchors = tf.convert_to_tensor(anchors)
    return
...
...
@@ -331,18 +326,10 @@ class PairWiseSearch:
           pred_classes,
           boxes,
           classes,
           scale=None,
           yxyx=True,
           clip_thresh=0.0):
-   num_boxes = tf.shape(boxes)[-2]
-   num_tiles = (num_boxes // TILE_SIZE) - 1

    if yxyx:
      boxes = box_ops.yxyx_to_xcycwh(boxes)

    if scale is not None:
      boxes = boxes * tf.stop_gradient(scale)

    if self._min_conf > 0.0:
      pred_classes = tf.cast(pred_classes > self._min_conf,
                             pred_classes.dtype)
...
...
@@ -535,32 +522,35 @@ def _darknet_new_coord_boxes(encoded_boxes, width, height, anchor_grid,
  return (scaler, scaled_box, pred_box), delta


- def _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
-                              grid_points, scale_xy):
+ def _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
+                              grid_points, darknet=False):
  """Decode models boxes using FPN stride under anchor free conditions."""
+ del darknet
  # split the boxes
  pred_xy = encoded_boxes[..., 0:2]
  pred_wh = encoded_boxes[..., 2:4]

  # build a scaling tensor to get the offset of the box relative to the image
  scaler = tf.convert_to_tensor([height, width, height, width])
- scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)

- # scale the centers and find the offset of each box relative to
- # their center pixel
- pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)

  # scale the offsets and add them to the grid points or a tensor that is
  # the relative location of each pixel
- box_xy = (grid_points + pred_xy) * stride
+ box_xy = grid_points + pred_xy

  # scale the width and height of the predictions and correlate them
  # to anchor boxes
- box_wh = tf.math.exp(pred_wh) * stride
+ box_wh = tf.math.exp(pred_wh)

  # build the final predicted box
  scaled_box = tf.concat([box_xy, box_wh], axis=-1)
- pred_box = scaled_box / scaler

+ # properly scaling boxes gradients
+ scaled_box = scaled_box * tf.cast(stride, scaled_box.dtype)
+ pred_box = scaled_box / tf.cast(scaler * stride, scaled_box.dtype)
  return (scaler, scaled_box, pred_box)
...
...
@@ -608,9 +598,8 @@ def get_predicted_box(width,
      range.
  """
  if box_type == 'anchor_free':
-   (scaler, scaled_box, pred_box) = _anchor_free_scale_boxes(
-       encoded_boxes, width, height, stride, grid_points, scale_xy)
+   (scaler, scaled_box, pred_box) = _anchor_free_scale_boxes(
+       encoded_boxes, width, height, stride, grid_points, darknet=darknet)
  elif darknet:
    # pylint:disable=unbalanced-tuple-unpacking
...
...
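A numeric sketch of the anchor-free decode above: the predicted center offsets are added to the grid points first, and the stride is folded in afterward so gradients flow through an unscaled box before the final multiply. The values here are arbitrary illustrations.

    import tensorflow as tf

    stride = 32.0
    grid_points = tf.constant([[4.0, 7.0]])  # cell indices at this level
    pred_xy = tf.constant([[0.3, 0.6]])      # predicted center offsets
    pred_wh = tf.constant([[0.2, -0.1]])     # predicted log-scale sizes

    box_xy = (grid_points + pred_xy) * stride  # -> [[137.6, 243.2]] pixels
    box_wh = tf.math.exp(pred_wh) * stride     # -> [[39.08, 28.95]] pixels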
official/vision/beta/projects/yolo/ops/mosaic.py
0 → 100755
View file @
482823c8
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mosaic op."""
import
random
import
tensorflow
as
tf
import
tensorflow_addons
as
tfa
from
official.vision.beta.ops
import
box_ops
from
official.vision.beta.ops
import
preprocess_ops
from
official.vision.beta.projects.yolo.ops
import
preprocessing_ops
class
Mosaic
:
"""Stitch together sets of 4 images to generate samples with more boxes."""
def
__init__
(
self
,
output_size
,
mosaic_frequency
=
1.0
,
mixup_frequency
=
0.0
,
letter_box
=
True
,
jitter
=
0.0
,
mosaic_crop_mode
=
'scale'
,
mosaic_center
=
0.25
,
aug_scale_min
=
1.0
,
aug_scale_max
=
1.0
,
aug_rand_angle
=
0.0
,
aug_rand_perspective
=
0.0
,
aug_rand_translate
=
0.0
,
random_pad
=
False
,
random_flip
=
False
,
area_thresh
=
0.1
,
pad_value
=
preprocessing_ops
.
PAD_VALUE
,
seed
=
None
):
"""Initializes parameters for mosaic.
Args:
output_size: `Tensor` or `List` for [height, width] of output image.
mosaic_frequency: `float` indicating how often to apply mosaic.
mixup_frequency: `float` indicating how often to apply mixup.
letter_box: `boolean` indicating whether upon start of the datapipeline
regardless of the preprocessing ops that are used, the aspect ratio of
the images should be preserved.
jitter: `float` for the maximum change in aspect ratio expected in each
preprocessing step.
mosaic_crop_mode: `str` they type of mosaic to apply. The options are
{crop, scale, None}, crop will construct a mosaic by slicing images
togther, scale will create a mosaic by concatnating and shifting the
image, and None will default to scale and apply no post processing to
the created mosaic.
mosaic_center: `float` indicating how much to randomly deviate from the
from the center of the image when creating a mosaic.
aug_scale_min: `float` indicating the minimum scaling value for image
scale jitter.
aug_scale_max: `float` indicating the maximum scaling value for image
scale jitter.
aug_rand_angle: `float` indicating the maximum angle value for angle.
angle will be changes between 0 and value.
aug_rand_perspective: `float` ranging from 0.000 to 0.001 indicating how
much to prespective warp the image.
aug_rand_translate: `float` ranging from 0 to 1 indicating the maximum
amount to randomly translate an image.
random_pad: `bool` indiccating wether to use padding to apply random
translation true for darknet yolo false for scaled yolo.
random_flip: `bool` whether or not to random flip the image.
area_thresh: `float` for the minimum area of a box to allow to pass
through for optimization.
pad_value: `int` padding value.
seed: `int` the seed for random number generation.
"""
self
.
_output_size
=
output_size
self
.
_area_thresh
=
area_thresh
self
.
_mosaic_frequency
=
mosaic_frequency
self
.
_mixup_frequency
=
mixup_frequency
self
.
_letter_box
=
letter_box
self
.
_random_crop
=
jitter
self
.
_mosaic_crop_mode
=
mosaic_crop_mode
self
.
_mosaic_center
=
mosaic_center
self
.
_aug_scale_min
=
aug_scale_min
self
.
_aug_scale_max
=
aug_scale_max
self
.
_random_pad
=
random_pad
self
.
_aug_rand_translate
=
aug_rand_translate
self
.
_aug_rand_angle
=
aug_rand_angle
self
.
_aug_rand_perspective
=
aug_rand_perspective
self
.
_random_flip
=
random_flip
self
.
_pad_value
=
pad_value
self
.
_deterministic
=
seed
is
not
None
self
.
_seed
=
seed
if
seed
is
not
None
else
random
.
randint
(
0
,
2
**
30
)
def
_generate_cut
(
self
):
"""Generate a random center to use for slicing and patching the images."""
if
self
.
_mosaic_crop_mode
==
'crop'
:
min_offset
=
self
.
_mosaic_center
cut_x
=
preprocessing_ops
.
random_uniform_strong
(
self
.
_output_size
[
1
]
*
min_offset
,
self
.
_output_size
[
1
]
*
(
1
-
min_offset
),
seed
=
self
.
_seed
)
cut_y
=
preprocessing_ops
.
random_uniform_strong
(
self
.
_output_size
[
0
]
*
min_offset
,
self
.
_output_size
[
0
]
*
(
1
-
min_offset
),
seed
=
self
.
_seed
)
cut
=
[
cut_y
,
cut_x
]
ishape
=
tf
.
convert_to_tensor
(
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
],
3
])
else
:
cut
=
None
ishape
=
tf
.
convert_to_tensor
(
[
self
.
_output_size
[
0
]
*
2
,
self
.
_output_size
[
1
]
*
2
,
3
])
return
cut
,
ishape
def
scale_boxes
(
self
,
patch
,
ishape
,
boxes
,
classes
,
xs
,
ys
):
"""Scale and translate the boxes for each image prior to patching."""
xs
=
tf
.
cast
(
xs
,
boxes
.
dtype
)
ys
=
tf
.
cast
(
ys
,
boxes
.
dtype
)
pshape
=
tf
.
cast
(
tf
.
shape
(
patch
),
boxes
.
dtype
)
ishape
=
tf
.
cast
(
ishape
,
boxes
.
dtype
)
translate
=
tf
.
cast
((
ishape
-
pshape
),
boxes
.
dtype
)
boxes
=
box_ops
.
denormalize_boxes
(
boxes
,
pshape
[:
2
])
boxes
=
boxes
+
tf
.
cast
([
translate
[
0
]
*
ys
,
translate
[
1
]
*
xs
,
translate
[
0
]
*
ys
,
translate
[
1
]
*
xs
],
boxes
.
dtype
)
boxes
=
box_ops
.
normalize_boxes
(
boxes
,
ishape
[:
2
])
return
boxes
,
classes
def
_select_ind
(
self
,
inds
,
*
args
):
items
=
[]
for
item
in
args
:
items
.
append
(
tf
.
gather
(
item
,
inds
))
return
items
def
_augment_image
(
self
,
image
,
boxes
,
classes
,
is_crowd
,
area
,
xs
=
0.0
,
ys
=
0.0
,
cut
=
None
):
"""Process a single image prior to the application of patching."""
if
self
.
_random_flip
:
# Randomly flip the image horizontally.
image
,
boxes
,
_
=
preprocess_ops
.
random_horizontal_flip
(
image
,
boxes
,
seed
=
self
.
_seed
)
# Augment the image without resizing
image
,
infos
,
crop_points
=
preprocessing_ops
.
resize_and_jitter_image
(
image
,
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]],
random_pad
=
False
,
letter_box
=
self
.
_letter_box
,
jitter
=
self
.
_random_crop
,
shiftx
=
xs
,
shifty
=
ys
,
cut
=
cut
,
seed
=
self
.
_seed
)
# Clip and clean boxes.
boxes
,
inds
=
preprocessing_ops
.
transform_and_clip_boxes
(
boxes
,
infos
,
area_thresh
=
self
.
_area_thresh
,
shuffle_boxes
=
False
,
augment
=
True
,
seed
=
self
.
_seed
)
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
classes
,
is_crowd
,
area
)
# pylint:disable=unbalanced-tuple-unpacking
return
image
,
boxes
,
classes
,
is_crowd
,
area
,
crop_points
def
_mosaic_crop_image
(
self
,
image
,
boxes
,
classes
,
is_crowd
,
area
):
"""Process a patched image in preperation for final output."""
if
self
.
_mosaic_crop_mode
!=
'crop'
:
shape
=
tf
.
cast
(
preprocessing_ops
.
get_image_shape
(
image
),
tf
.
float32
)
center
=
shape
*
self
.
_mosaic_center
# shift the center of the image by applying a translation to the whole
# image
ch
=
tf
.
math
.
round
(
preprocessing_ops
.
random_uniform_strong
(
-
center
[
0
],
center
[
0
],
seed
=
self
.
_seed
))
cw
=
tf
.
math
.
round
(
preprocessing_ops
.
random_uniform_strong
(
-
center
[
1
],
center
[
1
],
seed
=
self
.
_seed
))
# clip the boxes to those with in the image
image
=
tfa
.
image
.
translate
(
image
,
[
cw
,
ch
],
fill_value
=
self
.
_pad_value
)
boxes
=
box_ops
.
denormalize_boxes
(
boxes
,
shape
[:
2
])
boxes
=
boxes
+
tf
.
cast
([
ch
,
cw
,
ch
,
cw
],
boxes
.
dtype
)
boxes
=
box_ops
.
clip_boxes
(
boxes
,
shape
[:
2
])
inds
=
box_ops
.
get_non_empty_box_indices
(
boxes
)
boxes
=
box_ops
.
normalize_boxes
(
boxes
,
shape
[:
2
])
boxes
,
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
boxes
,
classes
,
# pylint:disable=unbalanced-tuple-unpacking
is_crowd
,
area
)
# warp and scale the fully stitched sample
image
,
_
,
affine
=
preprocessing_ops
.
affine_warp_image
(
image
,
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]],
scale_min
=
self
.
_aug_scale_min
,
scale_max
=
self
.
_aug_scale_max
,
translate
=
self
.
_aug_rand_translate
,
degrees
=
self
.
_aug_rand_angle
,
perspective
=
self
.
_aug_rand_perspective
,
random_pad
=
self
.
_random_pad
,
seed
=
self
.
_seed
)
height
,
width
=
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]
image
=
tf
.
image
.
resize
(
image
,
(
height
,
width
))
# clip and clean boxes
boxes
,
inds
=
preprocessing_ops
.
transform_and_clip_boxes
(
boxes
,
None
,
affine
=
affine
,
area_thresh
=
self
.
_area_thresh
,
seed
=
self
.
_seed
)
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
classes
,
is_crowd
,
area
)
# pylint:disable=unbalanced-tuple-unpacking
return
image
,
boxes
,
classes
,
is_crowd
,
area
,
area
# mosaic full frequency doubles model speed
def
_process_image
(
self
,
sample
,
shiftx
,
shifty
,
cut
,
ishape
):
"""Process and augment each image."""
(
image
,
boxes
,
classes
,
is_crowd
,
area
,
crop_points
)
=
self
.
_augment_image
(
sample
[
'image'
],
sample
[
'groundtruth_boxes'
],
sample
[
'groundtruth_classes'
],
sample
[
'groundtruth_is_crowd'
],
sample
[
'groundtruth_area'
],
shiftx
,
shifty
,
cut
)
(
boxes
,
classes
)
=
self
.
scale_boxes
(
image
,
ishape
,
boxes
,
classes
,
1
-
shiftx
,
1
-
shifty
)
sample
[
'image'
]
=
image
sample
[
'groundtruth_boxes'
]
=
boxes
sample
[
'groundtruth_classes'
]
=
classes
sample
[
'groundtruth_is_crowd'
]
=
is_crowd
sample
[
'groundtruth_area'
]
=
area
sample
[
'shiftx'
]
=
shiftx
sample
[
'shifty'
]
=
shifty
sample
[
'crop_points'
]
=
crop_points
return
sample
def
_patch2
(
self
,
one
,
two
):
"""Stitch together 2 images in totality."""
sample
=
one
sample
[
'image'
]
=
tf
.
concat
([
one
[
'image'
],
two
[
'image'
]],
axis
=-
2
)
sample
[
'groundtruth_boxes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
sample
[
'groundtruth_classes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
sample
[
'groundtruth_area'
]
=
tf
.
concat
(
[
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
return
sample
def
_patch
(
self
,
one
,
two
):
"""Build the full 4 patch of images from sets of 2 images."""
image
=
tf
.
concat
([
one
[
'image'
],
two
[
'image'
]],
axis
=-
3
)
boxes
=
tf
.
concat
([
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
classes
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
is_crowd
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
area
=
tf
.
concat
([
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
if
self
.
_mosaic_crop_mode
is
not
None
:
image
,
boxes
,
classes
,
is_crowd
,
area
,
_
=
self
.
_mosaic_crop_image
(
image
,
boxes
,
classes
,
is_crowd
,
area
)
sample
=
one
height
,
width
=
preprocessing_ops
.
get_image_shape
(
image
)
sample
[
'image'
]
=
tf
.
cast
(
image
,
tf
.
uint8
)
sample
[
'groundtruth_boxes'
]
=
boxes
sample
[
'groundtruth_area'
]
=
area
sample
[
'groundtruth_classes'
]
=
tf
.
cast
(
classes
,
sample
[
'groundtruth_classes'
].
dtype
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
cast
(
is_crowd
,
tf
.
bool
)
sample
[
'width'
]
=
tf
.
cast
(
width
,
sample
[
'width'
].
dtype
)
sample
[
'height'
]
=
tf
.
cast
(
height
,
sample
[
'height'
].
dtype
)
sample
[
'num_detections'
]
=
tf
.
shape
(
sample
[
'groundtruth_boxes'
])[
1
]
sample
[
'is_mosaic'
]
=
tf
.
cast
(
1.0
,
tf
.
bool
)
del
sample
[
'shiftx'
]
del
sample
[
'shifty'
]
del
sample
[
'crop_points'
]
return
sample
def
_mosaic
(
self
,
one
,
two
,
three
,
four
):
"""Stitch together 4 images to build a mosaic."""
if
self
.
_mosaic_frequency
>=
1.0
:
domo
=
1.0
else
:
domo
=
preprocessing_ops
.
random_uniform_strong
(
0.0
,
1.0
,
dtype
=
tf
.
float32
,
seed
=
self
.
_seed
)
noop
=
one
.
copy
()
if
domo
>=
(
1
-
self
.
_mosaic_frequency
):
cut
,
ishape
=
self
.
_generate_cut
()
one
=
self
.
_process_image
(
one
,
1.0
,
1.0
,
cut
,
ishape
)
two
=
self
.
_process_image
(
two
,
0.0
,
1.0
,
cut
,
ishape
)
three
=
self
.
_process_image
(
three
,
1.0
,
0.0
,
cut
,
ishape
)
four
=
self
.
_process_image
(
four
,
0.0
,
0.0
,
cut
,
ishape
)
patch1
=
self
.
_patch2
(
one
,
two
)
patch2
=
self
.
_patch2
(
three
,
four
)
stitched
=
self
.
_patch
(
patch1
,
patch2
)
return
stitched
else
:
return
self
.
_add_param
(
noop
)
def
_mixup
(
self
,
one
,
two
):
"""Blend together 2 images for the mixup data augmentation."""
if
self
.
_mixup_frequency
>=
1.0
:
domo
=
1.0
else
:
domo
=
preprocessing_ops
.
random_uniform_strong
(
0.0
,
1.0
,
dtype
=
tf
.
float32
,
seed
=
self
.
_seed
)
noop
=
one
.
copy
()
if
domo
>=
(
1
-
self
.
_mixup_frequency
):
sample
=
one
otype
=
one
[
'image'
].
dtype
r
=
preprocessing_ops
.
random_uniform_strong
(
0.4
,
0.6
,
tf
.
float32
,
seed
=
self
.
_seed
)
sample
[
'image'
]
=
(
r
*
tf
.
cast
(
one
[
'image'
],
tf
.
float32
)
+
(
1
-
r
)
*
tf
.
cast
(
two
[
'image'
],
tf
.
float32
))
sample
[
'image'
]
=
tf
.
cast
(
sample
[
'image'
],
otype
)
sample
[
'groundtruth_boxes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
sample
[
'groundtruth_classes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
sample
[
'groundtruth_area'
]
=
tf
.
concat
(
[
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
return
sample
else
:
return
self
.
_add_param
(
noop
)
def
_add_param
(
self
,
sample
):
"""Add parameters to handle skipped images."""
sample
[
'is_mosaic'
]
=
tf
.
cast
(
0.0
,
tf
.
bool
)
sample
[
'num_detections'
]
=
tf
.
shape
(
sample
[
'groundtruth_boxes'
])[
0
]
return
sample
def
_apply
(
self
,
dataset
):
"""Apply mosaic to an input dataset."""
determ
=
self
.
_deterministic
dataset
=
dataset
.
prefetch
(
tf
.
data
.
AUTOTUNE
)
one
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
,
reshuffle_each_iteration
=
True
)
two
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
1
,
reshuffle_each_iteration
=
True
)
three
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
2
,
reshuffle_each_iteration
=
True
)
four
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
3
,
reshuffle_each_iteration
=
True
)
dataset
=
tf
.
data
.
Dataset
.
zip
((
one
,
two
,
three
,
four
))
dataset
=
dataset
.
map
(
self
.
_mosaic
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
if
self
.
_mixup_frequency
>
0
:
one
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
4
,
reshuffle_each_iteration
=
True
)
two
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
5
,
reshuffle_each_iteration
=
True
)
dataset
=
tf
.
data
.
Dataset
.
zip
((
one
,
two
))
dataset
=
dataset
.
map
(
self
.
_mixup
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
return
dataset
def
_skip
(
self
,
dataset
):
"""Skip samples in a dataset."""
determ
=
self
.
_deterministic
return
dataset
.
map
(
self
.
_add_param
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
def
mosaic_fn
(
self
,
is_training
=
True
):
"""Determine which function to apply based on whether model is training."""
if
is_training
and
self
.
_mosaic_frequency
>
0.0
:
return
self
.
_apply
else
:
return
self
.
_skip
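A usage sketch for wiring the op above into a tf.data pipeline; `train_ds` is an assumed dataset that yields COCO-style sample dicts with the 'image', 'groundtruth_*', 'width', and 'height' keys used by _process_image and _patch:

    mosaic = Mosaic(
        output_size=[640, 640],
        mosaic_frequency=0.75,
        mixup_frequency=0.2,
        mosaic_crop_mode='scale',
        random_flip=True,
        seed=42)
    # mosaic_fn returns _apply during training and _skip otherwise.
    train_ds = mosaic.mosaic_fn(is_training=True)(train_ds)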