ModelZoo / textmonkey_pytorch · Commits

Commit b1e6136c, authored Dec 26, 2023 by yuluoyun
data generation
parent 00946203
Changes: 404 files
Showing 20 changed files with 5057 additions and 0 deletions (+5057 −0)
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_export.py  +207 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_inference.py  +161 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_modeling.py  +419 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_patch.py  +152 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/flatten.py  +330 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/shared.py  +1034 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/torchscript.py  +132 −0
data_generation/grit/third_party/CenterNet2/detectron2/export/torchscript_patch.py  +406 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/__init__.py  +24 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/aspp.py  +144 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/batch_norm.py  +276 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/blocks.py  +111 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/README.md  +7 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h  +115 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp  +522 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu  +443 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h  +35 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp  +39 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu  +130 −0
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h  +370 −0
Too many changes to show. To preserve performance only 404 of 404+ files are displayed.
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_export.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import io
import logging
import numpy as np
from typing import List
import onnx
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.onnx.backend import Caffe2Backend
from tabulate import tabulate
from termcolor import colored
from torch.onnx import OperatorExportTypes

from .shared import (
    ScopedWS,
    construct_init_net_from_params,
    fuse_alias_placeholder,
    fuse_copy_between_cpu_and_gpu,
    get_params_from_init_net,
    group_norm_replace_aten_with_caffe2,
    infer_device_type,
    remove_dead_end_ops,
    remove_reshape_for_fc,
    save_graph,
)

logger = logging.getLogger(__name__)


def export_onnx_model(model, inputs):
    """
    Trace and export a model to onnx format.

    Args:
        model (nn.Module):
        inputs (tuple[args]): the model will be called by `model(*inputs)`

    Returns:
        an onnx model
    """
    assert isinstance(model, torch.nn.Module)

    # make sure all modules are in eval mode, onnx may change the training state
    # of the module if the states are not consistent
    def _check_eval(module):
        assert not module.training

    model.apply(_check_eval)

    # Export the model to ONNX
    with torch.no_grad():
        with io.BytesIO() as f:
            torch.onnx.export(
                model,
                inputs,
                f,
                operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
                # verbose=True,  # NOTE: uncomment this for debugging
                # export_params=True,
            )
            onnx_model = onnx.load_from_string(f.getvalue())

    # Apply ONNX's Optimization
    all_passes = onnx.optimizer.get_available_passes()
    passes = ["fuse_bn_into_conv"]
    assert all(p in all_passes for p in passes)
    onnx_model = onnx.optimizer.optimize(onnx_model, passes)
    return onnx_model


def _op_stats(net_def):
    type_count = {}
    for t in [op.type for op in net_def.op]:
        type_count[t] = type_count.get(t, 0) + 1
    type_count_list = sorted(type_count.items(), key=lambda kv: kv[0])  # alphabet
    type_count_list = sorted(type_count_list, key=lambda kv: -kv[1])  # count
    return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)


def _assign_device_option(
    predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
):
    """
    An ONNX-exported network doesn't have the concept of device; assign the necessary
    device option to each op in order to make it runnable on a GPU runtime.
    """

    def _get_device_type(torch_tensor):
        assert torch_tensor.device.type in ["cpu", "cuda"]
        assert torch_tensor.device.index == 0
        return torch_tensor.device.type

    def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
        for op, ssa_i in zip(net_proto.op, net_ssa):
            if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
                op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
            else:
                devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
                assert all(d == devices[0] for d in devices)
                if devices[0] == "cuda":
                    op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))

    # update ops in predict_net
    predict_net_input_device_types = {
        (name, 0): _get_device_type(tensor)
        for name, tensor in zip(predict_net.external_input, tensor_inputs)
    }
    predict_net_device_types = infer_device_type(
        predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
    )
    predict_net_ssa, _ = core.get_ssa(predict_net)
    _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)

    # update ops in init_net
    init_net_ssa, versions = core.get_ssa(init_net)
    init_net_output_device_types = {
        (name, versions[name]): predict_net_device_types[(name, 0)]
        for name in init_net.external_output
    }
    init_net_device_types = infer_device_type(
        init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
    )
    _assign_op_device_option(init_net, init_net_ssa, init_net_device_types)


def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
    """
    Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.

    Args:
        model: a caffe2-compatible version of a detectron2 model, defined in caffe2_modeling.py
        tensor_inputs: a list of tensors that the caffe2 model takes as input.
    """
    model = copy.deepcopy(model)
    assert isinstance(model, torch.nn.Module)
    assert hasattr(model, "encode_additional_info")

    # Export via ONNX
    logger.info(
        "Exporting a {} model via ONNX ...".format(type(model).__name__)
        + " Some warnings from ONNX are expected and are usually not to worry about."
    )
    onnx_model = export_onnx_model(model, (tensor_inputs,))
    # Convert ONNX model to Caffe2 protobuf
    init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
    ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
    table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
    logger.info(
        "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
    )

    # Apply protobuf optimization
    fuse_alias_placeholder(predict_net, init_net)
    if any(t.device.type != "cpu" for t in tensor_inputs):
        fuse_copy_between_cpu_and_gpu(predict_net)
        remove_dead_end_ops(init_net)
        _assign_device_option(predict_net, init_net, tensor_inputs)
    params, device_options = get_params_from_init_net(init_net)
    predict_net, params = remove_reshape_for_fc(predict_net, params)
    init_net = construct_init_net_from_params(params, device_options)
    group_norm_replace_aten_with_caffe2(predict_net)

    # Record necessary information for running the pb model in Detectron2 system.
    model.encode_additional_info(predict_net, init_net)

    logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
    logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))

    return predict_net, init_net


def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
    """
    Run the caffe2 model on given inputs, recording the blob shapes and drawing the graph.

    predict_net/init_net: caffe2 model.
    tensor_inputs: a list of tensors that the caffe2 model takes as input.
    graph_save_path: path for saving the graph of the exported model.
    """
    logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
    save_graph(predict_net, graph_save_path, op_only=False)

    # Run the exported Caffe2 net
    logger.info("Running ONNX exported model ...")
    with ScopedWS("__ws_tmp__", True) as ws:
        ws.RunNetOnce(init_net)
        initialized_blobs = set(ws.Blobs())
        uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
        for name, blob in zip(uninitialized, tensor_inputs):
            ws.FeedBlob(name, blob)

        try:
            ws.RunNetOnce(predict_net)
        except RuntimeError as e:
            logger.warning("Encountered RuntimeError: \n{}".format(str(e)))

        ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
        blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}

        logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
        save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)

        return ws_blobs
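A minimal sketch of how these export helpers might be driven end to end (a sketch under assumptions: `cfg`, `batched_inputs`, and the trained detectron2 model come from the usual build_model/DetectionCheckpointer pipeline, and `Caffe2GeneralizedRCNN` is the wrapper defined in caffe2_modeling.py below; the output paths are hypothetical):

    from detectron2.checkpoint import DetectionCheckpointer
    from detectron2.modeling import build_model

    # Build and load the trained pytorch model, then wrap it in its caffe2-compatible counterpart.
    torch_model = build_model(cfg)
    DetectionCheckpointer(torch_model).load(cfg.MODEL.WEIGHTS)
    traceable_model = Caffe2GeneralizedRCNN(cfg, torch_model)

    # Convert list[dict] inputs to the (data, im_info) tensors the traced graph expects.
    tensor_inputs = traceable_model.get_caffe2_inputs(batched_inputs)
    predict_net, init_net = export_caffe2_detection_model(traceable_model, list(tensor_inputs))

    # The two NetDefs are ordinary protobufs and can be serialized directly.
    with open("model.pb", "wb") as f:        # hypothetical output path
        f.write(predict_net.SerializeToString())
    with open("model_init.pb", "wb") as f:   # hypothetical output path
        f.write(init_net.SerializeToString())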
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_inference.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from itertools import count
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core

from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type

logger = logging.getLogger(__name__)


# ===== ref: mobile-vision predictor's 'Caffe2Wrapper' class ======
class ProtobufModel(torch.nn.Module):
    """
    Wrapper of a caffe2's protobuf model.
    It works just like nn.Module, but runs caffe2 under the hood.
    Input/Output are tuple[tensor] that match the caffe2 net's external_input/output.
    """

    _ids = count(0)

    def __init__(self, predict_net, init_net):
        logger.info(f"Initializing ProtobufModel for: {predict_net.name} ...")
        super().__init__()
        assert isinstance(predict_net, caffe2_pb2.NetDef)
        assert isinstance(init_net, caffe2_pb2.NetDef)
        # create unique temporary workspace for each instance
        self.ws_name = "__tmp_ProtobufModel_{}__".format(next(self._ids))
        self.net = core.Net(predict_net)

        logger.info("Running init_net once to fill the parameters ...")
        with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
            ws.RunNetOnce(init_net)
            uninitialized_external_input = []
            for blob in self.net.Proto().external_input:
                if blob not in ws.Blobs():
                    uninitialized_external_input.append(blob)
                    ws.CreateBlob(blob)
            ws.CreateNet(self.net)

        self._error_msgs = set()
        self._input_blobs = uninitialized_external_input

    def _infer_output_devices(self, inputs):
        """
        Returns:
            list[str]: list of device for each external output
        """

        def _get_device_type(torch_tensor):
            assert torch_tensor.device.type in ["cpu", "cuda"]
            assert torch_tensor.device.index == 0
            return torch_tensor.device.type

        predict_net = self.net.Proto()
        input_device_types = {
            (name, 0): _get_device_type(tensor)
            for name, tensor in zip(self._input_blobs, inputs)
        }
        device_type_map = infer_device_type(
            predict_net, known_status=input_device_types, device_name_style="pytorch"
        )
        ssa, versions = core.get_ssa(predict_net)
        versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
        output_devices = [device_type_map[outp] for outp in versioned_outputs]
        return output_devices

    def forward(self, inputs):
        """
        Args:
            inputs (tuple[torch.Tensor])

        Returns:
            tuple[torch.Tensor]
        """
        assert len(inputs) == len(self._input_blobs), (
            f"Length of inputs ({len(inputs)}) "
            f"doesn't match the required input blobs: {self._input_blobs}"
        )

        with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
            for b, tensor in zip(self._input_blobs, inputs):
                ws.FeedBlob(b, tensor)

            try:
                ws.RunNet(self.net.Proto().name)
            except RuntimeError as e:
                if not str(e) in self._error_msgs:
                    self._error_msgs.add(str(e))
                    logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
                logger.warning("Catch the error and use partial results.")

            c2_outputs = [ws.FetchBlob(b) for b in self.net.Proto().external_output]
            # Remove outputs of current run, this is necessary in order to
            # prevent fetching the result from previous run if the model fails
            # in the middle.
            for b in self.net.Proto().external_output:
                # Needs to create an uninitialized blob to make the net runnable.
                # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
                # but there's no such API.
                ws.FeedBlob(b, f"{b}, a C++ native class of type nullptr (uninitialized).")

        # Cast output to torch.Tensor on the desired device
        output_devices = (
            self._infer_output_devices(inputs)
            if any(t.device.type != "cpu" for t in inputs)
            else ["cpu" for _ in self.net.Proto().external_output]
        )

        outputs = []
        for name, c2_output, device in zip(
            self.net.Proto().external_output, c2_outputs, output_devices
        ):
            if not isinstance(c2_output, np.ndarray):
                raise RuntimeError(
                    "Invalid output for blob {}, received: {}".format(name, c2_output)
                )
            outputs.append(torch.tensor(c2_output).to(device=device))
        return tuple(outputs)


class ProtobufDetectionModel(torch.nn.Module):
    """
    A class that works just like a pytorch meta arch in terms of inference, but runs a
    caffe2 model under the hood.
    """

    def __init__(self, predict_net, init_net, *, convert_outputs=None):
        """
        Args:
            predict_net, init_net (core.Net): caffe2 nets
            convert_outputs (callable): a function that converts caffe2
                outputs to the same format as the original pytorch model.
                By default, use the one defined in the caffe2 meta_arch.
        """
        super().__init__()
        self.protobuf_model = ProtobufModel(predict_net, init_net)
        self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
        self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")

        if convert_outputs is None:
            meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
            meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
            self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
        else:
            self._convert_outputs = convert_outputs

    def _convert_inputs(self, batched_inputs):
        # currently all models convert inputs in the same way
        return convert_batched_inputs_to_c2_format(
            batched_inputs, self.size_divisibility, self.device
        )

    def forward(self, batched_inputs):
        c2_inputs = self._convert_inputs(batched_inputs)
        c2_results = self.protobuf_model(c2_inputs)
        c2_results = dict(zip(self.protobuf_model.net.Proto().external_output, c2_results))
        return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
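A small usage sketch for the wrapper above (assuming `predict_net` and `init_net` are the NetDefs produced by export_caffe2_detection_model and saved to the hypothetical paths from the earlier sketch, and `batched_inputs` is the usual detectron2 list[dict] input):

    import torch
    from caffe2.proto import caffe2_pb2

    # Re-load the exported protobufs.
    predict_net = caffe2_pb2.NetDef()
    with open("model.pb", "rb") as f:
        predict_net.ParseFromString(f.read())
    init_net = caffe2_pb2.NetDef()
    with open("model_init.pb", "rb") as f:
        init_net.ParseFromString(f.read())

    # Run the caffe2 nets through the pytorch-like interface defined above.
    c2_model = ProtobufDetectionModel(predict_net, init_net)
    with torch.no_grad():
        outputs = c2_model(batched_inputs)  # same output format as the original meta-arch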
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_modeling.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import functools
import io
import struct
import types
import torch

from detectron2.modeling import meta_arch
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads import keypoint_head
from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes

from .c10 import Caffe2Compatible
from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn
from .shared import (
    alias,
    check_set_pb_arg,
    get_pb_arg_floats,
    get_pb_arg_valf,
    get_pb_arg_vali,
    get_pb_arg_vals,
    mock_torch_nn_functional_interpolate,
)


def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
    to detectron2's format (i.e. list of Instances instance).
    This only works when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.

        force_mask_on (Bool): if true, it makes sure there'll be pred_masks even
            if the mask is not found from tensor_outputs (usually due to model crash)
    """

    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection will always succeed because Conv supports 0-batch
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    if bbox_nms.shape[1] == 5:
        result.pred_boxes = RotatedBoxes(bbox_nms)
    else:
        result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the height/width of mask here, it won't be
        # used anyway when batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
        # is set to False in HeatmapMaxKeypoint, so just using raw score, seems
        # it doesn't affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results


def _cast_to_f32(f64):
    return struct.unpack("f", struct.pack("f", f64))[0]


def set_caffe2_compatible_tensor_mode(model, enable=True):
    def _fn(m):
        if isinstance(m, Caffe2Compatible):
            m.tensor_mode = enable

    model.apply(_fn)


def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
    """
    See get_caffe2_inputs() below.
    """
    assert all(isinstance(x, dict) for x in batched_inputs)
    assert all(x["image"].dim() == 3 for x in batched_inputs)

    images = [x["image"] for x in batched_inputs]
    images = ImageList.from_tensors(images, size_divisibility)

    im_info = []
    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
        target_height = input_per_image.get("height", image_size[0])
        target_width = input_per_image.get("width", image_size[1])  # noqa
        # NOTE: The scale inside im_info is kept as convention and for providing
        # post-processing information if further processing is needed. For
        # current Caffe2 model definitions that don't include post-processing inside
        # the model, this number is not used.
        # NOTE: There can be a slight difference between width and height
        # scales, using a single number can result in numerical differences
        # compared with D2's post-processing.
        scale = target_height / image_size[0]
        im_info.append([image_size[0], image_size[1], scale])
    im_info = torch.Tensor(im_info)

    return images.tensor.to(device), im_info.to(device)


class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
    """
    Base class for caffe2-compatible implementation of a meta architecture.
    The forward is traceable and its traced graph can be converted to caffe2
    graph through ONNX.
    """

    def __init__(self, cfg, torch_model):
        """
        Args:
            cfg (CfgNode):
            torch_model (nn.Module): the detectron2 model (meta_arch) to be
                converted.
        """
        super().__init__()
        self._wrapped_model = torch_model
        self.eval()
        set_caffe2_compatible_tensor_mode(self, True)

    def get_caffe2_inputs(self, batched_inputs):
        """
        Convert pytorch-style structured inputs to caffe2-style inputs that
        are tuples of tensors.

        Args:
            batched_inputs (list[dict]): inputs to a detectron2 model
                in its standard format. Each dict has "image" (CHW tensor), and optionally
                "height" and "width".

        Returns:
            tuple[Tensor]:
                tuple of tensors that will be the inputs to the
                :meth:`forward` method. For existing models, the first
                is an NCHW tensor (padded and batched); the second is
                an im_info Nx3 tensor, where the rows are
                (height, width, unused legacy parameter)
        """
        return convert_batched_inputs_to_c2_format(
            batched_inputs,
            self._wrapped_model.backbone.size_divisibility,
            self._wrapped_model.device,
        )

    def encode_additional_info(self, predict_net, init_net):
        """
        Save extra metadata that will be used by inference in the output protobuf.
        """
        pass

    def forward(self, inputs):
        """
        Run the forward in caffe2-style. It has to use caffe2-compatible ops
        and the method will be used for tracing.

        Args:
            inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`.
                They will be the inputs of the converted caffe2 graph.

        Returns:
            tuple[Tensor]: output tensors. They will be the outputs of the
                converted caffe2 graph.
        """
        raise NotImplementedError

    def _caffe2_preprocess_image(self, inputs):
        """
        Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
        It normalizes the input images, and the final caffe2 graph assumes the
        inputs have been batched already.
        """
        data, im_info = inputs
        data = alias(data, "data")
        im_info = alias(im_info, "im_info")
        mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
        normalized_data = (data - mean) / std
        normalized_data = alias(normalized_data, "normalized_data")

        # Pack (data, im_info) into ImageList which is recognized by self.inference.
        images = ImageList(tensor=normalized_data, image_sizes=im_info)
        return images

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        """
        Creates a function that converts outputs of the caffe2 model to
        detectron2's standard format.
        The function uses information in `predict_net` and `init_net` that are
        available at inference time. Therefore the function logic can be used in inference.

        The returned function has the following signature:

            def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs

        Where

            * batched_inputs (list[dict]): the original input format of the meta arch
            * c2_inputs (tuple[Tensor]): the caffe2 inputs.
            * c2_results (dict[str, Tensor]): the caffe2 output format,
                corresponding to the outputs of the :meth:`forward` function.
            * detectron2_outputs: the original output format of the meta arch.

        This function can be used to compare the outputs of the original meta arch and
        the converted caffe2 graph.

        Returns:
            callable: a callable of the above signature.
        """
        raise NotImplementedError


class Caffe2GeneralizedRCNN(Caffe2MetaArch):
    def __init__(self, cfg, torch_model):
        assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
        torch_model = patch_generalized_rcnn(torch_model)
        super().__init__(cfg, torch_model)

        try:
            use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
        except AttributeError:
            use_heatmap_max_keypoint = False
        self.roi_heads_patcher = ROIHeadsPatcher(
            self._wrapped_model.roi_heads, use_heatmap_max_keypoint
        )

    def encode_additional_info(self, predict_net, init_net):
        size_divisibility = self._wrapped_model.backbone.size_divisibility
        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
        check_set_pb_arg(
            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
        )
        check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")

    @mock_torch_nn_functional_interpolate()
    def forward(self, inputs):
        if not self.tensor_mode:
            return self._wrapped_model.inference(inputs)
        images = self._caffe2_preprocess_image(inputs)
        features = self._wrapped_model.backbone(images.tensor)
        proposals, _ = self._wrapped_model.proposal_generator(images, features)
        with self.roi_heads_patcher.mock_roi_heads():
            detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
        return tuple(detector_results[0].flatten())

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        def f(batched_inputs, c2_inputs, c2_results):
            _, im_info = c2_inputs
            image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
            results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)

        return f


class Caffe2RetinaNet(Caffe2MetaArch):
    def __init__(self, cfg, torch_model):
        assert isinstance(torch_model, meta_arch.RetinaNet)
        super().__init__(cfg, torch_model)

    @mock_torch_nn_functional_interpolate()
    def forward(self, inputs):
        assert self.tensor_mode
        images = self._caffe2_preprocess_image(inputs)

        # explicitly return the image sizes to avoid removing "im_info" by ONNX
        # since it's not used in the forward path
        return_tensors = [images.image_sizes]

        features = self._wrapped_model.backbone(images.tensor)
        features = [features[f] for f in self._wrapped_model.head_in_features]
        for i, feature_i in enumerate(features):
            features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
            return_tensors.append(features[i])

        pred_logits, pred_anchor_deltas = self._wrapped_model.head(features)
        for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)):
            return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
            return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))

        return tuple(return_tensors)

    def encode_additional_info(self, predict_net, init_net):
        size_divisibility = self._wrapped_model.backbone.size_divisibility
        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
        check_set_pb_arg(
            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
        )
        check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")

        # Inference parameters:
        check_set_pb_arg(
            predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh)
        )
        check_set_pb_arg(
            predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates
        )
        check_set_pb_arg(
            predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh)
        )
        check_set_pb_arg(
            predict_net,
            "max_detections_per_image",
            "i",
            self._wrapped_model.max_detections_per_image,
        )

        check_set_pb_arg(
            predict_net,
            "bbox_reg_weights",
            "floats",
            [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
        )
        self._encode_anchor_generator_cfg(predict_net)

    def _encode_anchor_generator_cfg(self, predict_net):
        # serialize anchor_generator for future use
        serialized_anchor_generator = io.BytesIO()
        torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
        # Ideally we can put anchor generating inside the model, then we don't
        # need to store this information.
        bytes = serialized_anchor_generator.getvalue()
        check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes)

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        self = types.SimpleNamespace()
        serialized_anchor_generator = io.BytesIO(
            get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
        )
        self.anchor_generator = torch.load(serialized_anchor_generator)
        bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
        self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
        self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None)
        self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
        self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None)
        self.max_detections_per_image = get_pb_arg_vali(
            predict_net, "max_detections_per_image", None
        )

        # hack to reuse inference code from RetinaNet
        for meth in [
            "forward_inference",
            "inference_single_image",
            "_transpose_dense_predictions",
            "_decode_multi_level_predictions",
            "_decode_per_level_predictions",
        ]:
            setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self))

        def f(batched_inputs, c2_inputs, c2_results):
            _, im_info = c2_inputs
            image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
            dummy_images = ImageList(
                torch.randn((len(im_info), 3) + tuple(image_sizes[0])),
                image_sizes,
            )

            num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
            pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
            pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]

            # For each feature level, feature should have the same batch size and
            # spatial dimension as the box_cls and box_delta.
            dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
            # self.num_classes can be inferred
            self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)

            results = self.forward_inference(
                dummy_images, dummy_features, [pred_logits, pred_anchor_deltas]
            )
            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)

        return f


META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
    "GeneralizedRCNN": Caffe2GeneralizedRCNN,
    "RetinaNet": Caffe2RetinaNet,
}
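A minimal sketch of how the META_ARCH_CAFFE2_EXPORT_TYPE_MAP at the end of this file might be used to pick the caffe2-compatible wrapper (a sketch; `cfg`, `torch_model`, and `batched_inputs` are assumed to come from detectron2's standard build pipeline):

    # Look up the wrapper class by meta-architecture name and build the traceable model.
    C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
    traceable_model = C2MetaArch(cfg, torch_model)

    # The wrapper converts the rich inputs into the (data, im_info) tensors used for tracing.
    caffe2_inputs = traceable_model.get_caffe2_inputs(batched_inputs)
    flat_outputs = traceable_model(caffe2_inputs)  # tuple of tensors, suitable for ONNX tracing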
data_generation/grit/third_party/CenterNet2/detectron2/export/caffe2_patch.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
from unittest import mock
import torch

from detectron2.modeling import poolers
from detectron2.modeling.proposal_generator import rpn
from detectron2.modeling.roi_heads import keypoint_head, mask_head
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers

from .c10 import (
    Caffe2Compatible,
    Caffe2FastRCNNOutputsInference,
    Caffe2KeypointRCNNInference,
    Caffe2MaskRCNNInference,
    Caffe2ROIPooler,
    Caffe2RPN,
)


class GenericMixin(object):
    pass


class Caffe2CompatibleConverter(object):
    """
    A GenericUpdater which implements the `create_from` interface, by modifying
    the module object and assigning it another class replaceCls.
    """

    def __init__(self, replaceCls):
        self.replaceCls = replaceCls

    def create_from(self, module):
        # update module's class to the new class
        assert isinstance(module, torch.nn.Module)
        if issubclass(self.replaceCls, GenericMixin):
            # replaceCls should act as mixin, create a new class on-the-fly
            new_class = type(
                "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
                (self.replaceCls, module.__class__),
                {},  # {"new_method": lambda self: ...},
            )
            module.__class__ = new_class
        else:
            # replaceCls is a complete class, this allows arbitrary class swaps
            module.__class__ = self.replaceCls

        # initialize Caffe2Compatible
        if isinstance(module, Caffe2Compatible):
            module.tensor_mode = False

        return module


def patch(model, target, updater, *args, **kwargs):
    """
    Recursively (post-order) update all modules with the target type and its
    subclasses, making an initialization/composition/inheritance/... via
    updater.create_from.
    """
    for name, module in model.named_children():
        model._modules[name] = patch(module, target, updater, *args, **kwargs)
    if isinstance(model, target):
        return updater.create_from(model, *args, **kwargs)
    return model


def patch_generalized_rcnn(model):
    ccc = Caffe2CompatibleConverter
    model = patch(model, rpn.RPN, ccc(Caffe2RPN))
    model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))

    return model


@contextlib.contextmanager
def mock_fastrcnn_outputs_inference(
    tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
):
    with mock.patch.object(
        box_predictor_type,
        "inference",
        autospec=True,
        side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
    ) as mocked_func:
        yield
    if check:
        assert mocked_func.call_count > 0


@contextlib.contextmanager
def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
    with mock.patch(
        "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
    ) as mocked_func:
        yield
    if check:
        assert mocked_func.call_count > 0


@contextlib.contextmanager
def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
    with mock.patch(
        "{}.keypoint_rcnn_inference".format(patched_module),
        side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
    ) as mocked_func:
        yield
    if check:
        assert mocked_func.call_count > 0


class ROIHeadsPatcher:
    def __init__(self, heads, use_heatmap_max_keypoint):
        self.heads = heads
        self.use_heatmap_max_keypoint = use_heatmap_max_keypoint

    @contextlib.contextmanager
    def mock_roi_heads(self, tensor_mode=True):
        """
        Patching several inference functions inside ROIHeads and its subclasses

        Args:
            tensor_mode (bool): whether the inputs/outputs are caffe2's tensor
                format or not. Default to True.
        """
        # NOTE: this requires that `keypoint_rcnn_inference` and `mask_rcnn_inference`
        # are called inside the same file as BaseXxxHead due to using mock.patch.
        kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
        mask_head_mod = mask_head.BaseMaskRCNNHead.__module__

        mock_ctx_managers = [
            mock_fastrcnn_outputs_inference(
                tensor_mode=tensor_mode,
                check=True,
                box_predictor_type=type(self.heads.box_predictor),
            )
        ]
        if getattr(self.heads, "keypoint_on", False):
            mock_ctx_managers += [
                mock_keypoint_rcnn_inference(
                    tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
                )
            ]
        if getattr(self.heads, "mask_on", False):
            mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]

        with contextlib.ExitStack() as stack:  # python 3.3+
            for mgr in mock_ctx_managers:
                stack.enter_context(mgr)
            yield
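A sketch of how the patcher above is used during tracing, mirroring Caffe2GeneralizedRCNN.forward in caffe2_modeling.py (here `roi_heads`, `images`, `features`, and `proposals` are assumed to come from an already-patched GeneralizedRCNN):

    patcher = ROIHeadsPatcher(roi_heads, use_heatmap_max_keypoint=False)
    with patcher.mock_roi_heads():
        # Inside this context the box/mask/keypoint inference calls are redirected
        # to their Caffe2* counterparts from .c10, so the trace records caffe2 ops.
        detector_results, _ = roi_heads(images, features, proposals)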
data_generation/grit/third_party/CenterNet2/detectron2/export/flatten.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import collections
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple
import torch
from torch import nn

from detectron2.structures import Boxes, Instances, ROIMasks
from detectron2.utils.registry import _convert_target_to_string, locate

from .torchscript_patch import patch_builtin_len


@dataclass
class Schema:
    """
    A Schema defines how to flatten a possibly hierarchical object into a tuple of
    primitive objects, so it can be used as inputs/outputs of PyTorch's tracing.

    PyTorch does not support tracing a function that produces rich output
    structures (e.g. dict, Instances, Boxes). To trace such a function, we
    flatten the rich object into a tuple of tensors, and return this tuple of tensors
    instead. Meanwhile, we also need to know how to "rebuild" the original object
    from the flattened results, so we can evaluate the flattened results.
    A Schema defines how to flatten an object, and while flattening it, it records
    the necessary schemas so that the object can be rebuilt using the flattened outputs.

    The flattened object and the schema object are returned by the ``.flatten`` classmethod.
    Then the original object can be rebuilt with the ``__call__`` method of the schema.

    A Schema is a dataclass that can be serialized easily.
    """

    # inspired by FetchMapper in tensorflow/python/client/session.py

    @classmethod
    def flatten(cls, obj):
        raise NotImplementedError

    def __call__(self, values):
        raise NotImplementedError

    @staticmethod
    def _concat(values):
        ret = ()
        sizes = []
        for v in values:
            assert isinstance(v, tuple), "Flattened results must be a tuple"
            ret = ret + v
            sizes.append(len(v))
        return ret, sizes

    @staticmethod
    def _split(values, sizes):
        if len(sizes):
            expected_len = sum(sizes)
            assert (
                len(values) == expected_len
            ), f"Values has length {len(values)} but expect length {expected_len}."
        ret = []
        for k in range(len(sizes)):
            begin, end = sum(sizes[:k]), sum(sizes[: k + 1])
            ret.append(values[begin:end])
        return ret


@dataclass
class ListSchema(Schema):
    schemas: List[Schema]  # the schemas that define how to flatten each element in the list
    sizes: List[int]  # the flattened length of each element

    def __call__(self, values):
        values = self._split(values, self.sizes)
        if len(values) != len(self.schemas):
            raise ValueError(
                f"Values has length {len(values)} but schemas "
                f"has length {len(self.schemas)}!"
            )
        values = [m(v) for m, v in zip(self.schemas, values)]
        return list(values)

    @classmethod
    def flatten(cls, obj):
        res = [flatten_to_tuple(k) for k in obj]
        values, sizes = cls._concat([k[0] for k in res])
        return values, cls([k[1] for k in res], sizes)


@dataclass
class TupleSchema(ListSchema):
    def __call__(self, values):
        return tuple(super().__call__(values))


@dataclass
class IdentitySchema(Schema):
    def __call__(self, values):
        return values[0]

    @classmethod
    def flatten(cls, obj):
        return (obj,), cls()


@dataclass
class DictSchema(ListSchema):
    keys: List[str]

    def __call__(self, values):
        values = super().__call__(values)
        return dict(zip(self.keys, values))

    @classmethod
    def flatten(cls, obj):
        for k in obj.keys():
            if not isinstance(k, str):
                raise KeyError("Only support flattening dictionaries if keys are str.")
        keys = sorted(obj.keys())
        values = [obj[k] for k in keys]
        ret, schema = ListSchema.flatten(values)
        return ret, cls(schema.schemas, schema.sizes, keys)


@dataclass
class InstancesSchema(DictSchema):
    def __call__(self, values):
        image_size, fields = values[-1], values[:-1]
        fields = super().__call__(fields)
        return Instances(image_size, **fields)

    @classmethod
    def flatten(cls, obj):
        ret, schema = super().flatten(obj.get_fields())
        size = obj.image_size
        if not isinstance(size, torch.Tensor):
            size = torch.tensor(size)
        return ret + (size,), schema


@dataclass
class TensorWrapSchema(Schema):
    """
    For classes that are simple wrappers of tensors, e.g.
    Boxes, RotatedBoxes, BitMasks
    """

    class_name: str

    def __call__(self, values):
        return locate(self.class_name)(values[0])

    @classmethod
    def flatten(cls, obj):
        return (obj.tensor,), cls(_convert_target_to_string(type(obj)))


# if more custom structures needed in the future, can allow
# passing in extra schemas for custom types
def flatten_to_tuple(obj):
    """
    Flatten an object so it can be used for PyTorch tracing.
    Also returns how to rebuild the original object from the flattened outputs.

    Returns:
        res (tuple): the flattened results that can be used as tracing outputs
        schema: an object with a ``__call__`` method such that ``schema(res) == obj``.
            It is a pure dataclass that can be serialized.
    """
    schemas = [
        ((str, bytes), IdentitySchema),
        (list, ListSchema),
        (tuple, TupleSchema),
        (collections.abc.Mapping, DictSchema),
        (Instances, InstancesSchema),
        ((Boxes, ROIMasks), TensorWrapSchema),
    ]
    for klass, schema in schemas:
        if isinstance(obj, klass):
            F = schema
            break
    else:
        F = IdentitySchema

    return F.flatten(obj)


class TracingAdapter(nn.Module):
    """
    A model may take rich input/output format (e.g. dict or custom classes),
    but `torch.jit.trace` requires tuple of tensors as input/output.
    This adapter flattens the input/output format of a model so it becomes traceable.

    It also records the necessary schema to rebuild the model's inputs/outputs from flattened
    inputs/outputs.

    Example:
    ::
        outputs = model(inputs)   # inputs/outputs may be rich structure
        adapter = TracingAdapter(model, inputs)

        # can now trace the model, with adapter.flattened_inputs, or another
        # tuple of tensors with the same length and meaning
        traced = torch.jit.trace(adapter, adapter.flattened_inputs)

        # traced model can only produce flattened outputs (tuple of tensors)
        flattened_outputs = traced(*adapter.flattened_inputs)
        # adapter knows the schema to convert it back (new_outputs == outputs)
        new_outputs = adapter.outputs_schema(flattened_outputs)
    """

    flattened_inputs: Tuple[torch.Tensor] = None
    """
    Flattened version of inputs given to this class's constructor.
    """

    inputs_schema: Schema = None
    """
    Schema of the inputs given to this class's constructor.
    """

    outputs_schema: Schema = None
    """
    Schema of the output produced by calling the given model with inputs.
    """

    def __init__(
        self,
        model: nn.Module,
        inputs,
        inference_func: Optional[Callable] = None,
        allow_non_tensor: bool = False,
    ):
        """
        Args:
            model: an nn.Module
            inputs: An input argument or a tuple of input arguments used to call model.
                After flattening, it has to only consist of tensors.
            inference_func: a callable that takes (model, *inputs), calls the
                model with inputs, and returns outputs. By default it
                is ``lambda model, *inputs: model(*inputs)``. Can be overridden
                if you need to call the model differently.
            allow_non_tensor: allow inputs/outputs to contain non-tensor objects.
                This option will filter out non-tensor objects to make the
                model traceable, but ``inputs_schema``/``outputs_schema`` cannot be
                used anymore because inputs/outputs cannot be rebuilt from pure tensors.
                This is useful when you're only interested in the single trace of
                execution (e.g. for flop count), but not interested in
                generalizing the traced graph to new inputs.
        """
        super().__init__()
        if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
            model = model.module
        self.model = model
        if not isinstance(inputs, tuple):
            inputs = (inputs,)
        self.inputs = inputs
        self.allow_non_tensor = allow_non_tensor

        if inference_func is None:
            inference_func = lambda model, *inputs: model(*inputs)  # noqa
        self.inference_func = inference_func

        self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs)

        if all(isinstance(x, torch.Tensor) for x in self.flattened_inputs):
            return
        if self.allow_non_tensor:
            self.flattened_inputs = tuple(
                [x for x in self.flattened_inputs if isinstance(x, torch.Tensor)]
            )
            self.inputs_schema = None
        else:
            for input in self.flattened_inputs:
                if not isinstance(input, torch.Tensor):
                    raise ValueError(
                        "Inputs for tracing must only contain tensors. "
                        f"Got a {type(input)} instead."
                    )

    def forward(self, *args: torch.Tensor):
        with torch.no_grad(), patch_builtin_len():
            if self.inputs_schema is not None:
                inputs_orig_format = self.inputs_schema(args)
            else:
                if len(args) != len(self.flattened_inputs) or any(
                    x is not y for x, y in zip(args, self.flattened_inputs)
                ):
                    raise ValueError(
                        "TracingAdapter does not contain valid inputs_schema."
                        " So it cannot generalize to other inputs and must be"
                        " traced with `.flattened_inputs`."
                    )
                inputs_orig_format = self.inputs

            outputs = self.inference_func(self.model, *inputs_orig_format)
            flattened_outputs, schema = flatten_to_tuple(outputs)

            flattened_output_tensors = tuple(
                [x for x in flattened_outputs if isinstance(x, torch.Tensor)]
            )
            if len(flattened_output_tensors) < len(flattened_outputs):
                if self.allow_non_tensor:
                    flattened_outputs = flattened_output_tensors
                    self.outputs_schema = None
                else:
                    raise ValueError(
                        "Model cannot be traced because some model outputs "
                        "cannot flatten to tensors."
                    )
            else:  # schema is valid
                if self.outputs_schema is None:
                    self.outputs_schema = schema
                else:
                    assert self.outputs_schema == schema, (
                        "Model should always return outputs with the same "
                        "structure so it can be traced!"
                    )
            return flattened_outputs

    def _create_wrapper(self, traced_model):
        """
        Return a function that has an input/output interface the same as the
        original model, but it calls the given traced model under the hood.
        """

        def forward(*args):
            flattened_inputs, _ = flatten_to_tuple(args)
            flattened_outputs = traced_model(*flattened_inputs)
            return self.outputs_schema(flattened_outputs)

        return forward
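A tiny round-trip sketch for flatten_to_tuple, using only built-in containers so it does not depend on detectron2 structures (a sketch, not part of the committed file):

    import torch

    obj = {"a": (torch.zeros(2), torch.ones(3)), "b": [torch.arange(4)]}
    flat, schema = flatten_to_tuple(obj)   # flat is a tuple of 3 tensors
    rebuilt = schema(flat)                 # rebuilds {"a": (...), "b": [...]}
    assert torch.equal(rebuilt["b"][0], obj["b"][0])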
data_generation/grit/third_party/CenterNet2/detectron2/export/shared.py (new file, mode 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import
collections
import
contextlib
import
copy
import
functools
import
logging
import
numpy
as
np
import
os
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
unittest
import
mock
import
caffe2.python.utils
as
putils
import
torch
import
torch.nn.functional
as
F
from
caffe2.proto
import
caffe2_pb2
from
caffe2.python
import
core
,
net_drawer
,
workspace
from
torch.nn.functional
import
interpolate
as
interp
logger
=
logging
.
getLogger
(
__name__
)
# ==== torch/utils_toffee/cast.py =======================================
def
to_device
(
t
,
device_str
):
"""
This function is a replacement of .to(another_device) such that it allows the
casting to be traced properly by explicitly calling the underlying copy ops.
It also avoids introducing unncessary op when casting to the same device.
"""
src
=
t
.
device
dst
=
torch
.
device
(
device_str
)
if
src
==
dst
:
return
t
elif
src
.
type
==
"cuda"
and
dst
.
type
==
"cpu"
:
return
torch
.
ops
.
_caffe2
.
CopyGPUToCPU
(
t
)
elif
src
.
type
==
"cpu"
and
dst
.
type
==
"cuda"
:
return
torch
.
ops
.
_caffe2
.
CopyCPUToGPU
(
t
)
else
:
raise
RuntimeError
(
"Can't cast tensor from device {} to device {}"
.
format
(
src
,
dst
))
# ==== torch/utils_toffee/interpolate.py =======================================
# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
def
BilinearInterpolation
(
tensor_in
,
up_scale
):
assert
up_scale
%
2
==
0
,
"Scale should be even"
def
upsample_filt
(
size
):
factor
=
(
size
+
1
)
//
2
if
size
%
2
==
1
:
center
=
factor
-
1
else
:
center
=
factor
-
0.5
og
=
np
.
ogrid
[:
size
,
:
size
]
return
(
1
-
abs
(
og
[
0
]
-
center
)
/
factor
)
*
(
1
-
abs
(
og
[
1
]
-
center
)
/
factor
)
kernel_size
=
int
(
up_scale
)
*
2
bil_filt
=
upsample_filt
(
kernel_size
)
dim
=
int
(
tensor_in
.
shape
[
1
])
kernel
=
np
.
zeros
((
dim
,
dim
,
kernel_size
,
kernel_size
),
dtype
=
np
.
float32
)
kernel
[
range
(
dim
),
range
(
dim
),
:,
:]
=
bil_filt
tensor_out
=
F
.
conv_transpose2d
(
tensor_in
,
weight
=
to_device
(
torch
.
Tensor
(
kernel
),
tensor_in
.
device
),
bias
=
None
,
stride
=
int
(
up_scale
),
padding
=
int
(
up_scale
/
2
),
)
return
tensor_out
# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
# using dynamic `scale_factor` rather than static `size`. (T43166860)
# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
def
onnx_compatibale_interpolate
(
input
,
size
=
None
,
scale_factor
=
None
,
mode
=
"nearest"
,
align_corners
=
None
):
# NOTE: The input dimensions are interpreted in the form:
# `mini-batch x channels x [optional depth] x [optional height] x width`.
if
size
is
None
and
scale_factor
is
not
None
:
if
input
.
dim
()
==
4
:
if
isinstance
(
scale_factor
,
(
int
,
float
)):
height_scale
,
width_scale
=
(
scale_factor
,
scale_factor
)
else
:
assert
isinstance
(
scale_factor
,
(
tuple
,
list
))
assert
len
(
scale_factor
)
==
2
height_scale
,
width_scale
=
scale_factor
assert
not
align_corners
,
"No matching C2 op for align_corners == True"
if
mode
==
"nearest"
:
return
torch
.
ops
.
_caffe2
.
ResizeNearest
(
input
,
order
=
"NCHW"
,
width_scale
=
width_scale
,
height_scale
=
height_scale
)
elif
mode
==
"bilinear"
:
logger
.
warning
(
"Use F.conv_transpose2d for bilinear interpolate"
" because there's no such C2 op, this may cause significant"
" slowdown and the boundary pixels won't be as same as"
" using F.interpolate due to padding."
)
assert
height_scale
==
width_scale
return
BilinearInterpolation
(
input
,
up_scale
=
height_scale
)
logger
.
warning
(
"Output size is not static, it might cause ONNX conversion issue"
)
return
interp
(
input
,
size
,
scale_factor
,
mode
,
align_corners
)
@
contextlib
.
contextmanager
def
mock_torch_nn_functional_interpolate
():
if
torch
.
onnx
.
is_in_onnx_export
():
with
mock
.
patch
(
"torch.nn.functional.interpolate"
,
side_effect
=
onnx_compatibale_interpolate
):
yield
else
:
yield
# ==== torch/utils_caffe2/ws_utils.py ==========================================
class
ScopedWS
(
object
):
def
__init__
(
self
,
ws_name
,
is_reset
,
is_cleanup
=
False
):
self
.
ws_name
=
ws_name
self
.
is_reset
=
is_reset
self
.
is_cleanup
=
is_cleanup
self
.
org_ws
=
""
def
__enter__
(
self
):
self
.
org_ws
=
workspace
.
CurrentWorkspace
()
if
self
.
ws_name
is
not
None
:
workspace
.
SwitchWorkspace
(
self
.
ws_name
,
True
)
if
self
.
is_reset
:
workspace
.
ResetWorkspace
()
return
workspace
def
__exit__
(
self
,
*
args
):
if
self
.
is_cleanup
:
workspace
.
ResetWorkspace
()
if
self
.
ws_name
is
not
None
:
workspace
.
SwitchWorkspace
(
self
.
org_ws
)
def
fetch_any_blob
(
name
):
bb
=
None
try
:
bb
=
workspace
.
FetchBlob
(
name
)
except
TypeError
:
bb
=
workspace
.
FetchInt8Blob
(
name
)
except
Exception
as
e
:
logger
.
error
(
"Get blob {} error: {}"
.
format
(
name
,
e
))
return
bb
# ==== torch/utils_caffe2/protobuf.py ==========================================
def
get_pb_arg
(
pb
,
arg_name
):
for
x
in
pb
.
arg
:
if
x
.
name
==
arg_name
:
return
x
return
None
def
get_pb_arg_valf
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
arg
.
f
if
arg
is
not
None
else
default_val
def
get_pb_arg_floats
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
list
(
map
(
float
,
arg
.
floats
))
if
arg
is
not
None
else
default_val
def
get_pb_arg_ints
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
list
(
map
(
int
,
arg
.
ints
))
if
arg
is
not
None
else
default_val
def
get_pb_arg_vali
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
arg
.
i
if
arg
is
not
None
else
default_val
def
get_pb_arg_vals
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
arg
.
s
if
arg
is
not
None
else
default_val
def
get_pb_arg_valstrings
(
pb
,
arg_name
,
default_val
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
return
list
(
arg
.
strings
)
if
arg
is
not
None
else
default_val
def
check_set_pb_arg
(
pb
,
arg_name
,
arg_attr
,
arg_value
,
allow_override
=
False
):
arg
=
get_pb_arg
(
pb
,
arg_name
)
if
arg
is
None
:
arg
=
putils
.
MakeArgument
(
arg_name
,
arg_value
)
assert
hasattr
(
arg
,
arg_attr
)
pb
.
arg
.
extend
([
arg
])
if
allow_override
and
getattr
(
arg
,
arg_attr
)
!=
arg_value
:
logger
.
warning
(
"Override argument {}: {} -> {}"
.
format
(
arg_name
,
getattr
(
arg
,
arg_attr
),
arg_value
)
)
setattr
(
arg
,
arg_attr
,
arg_value
)
else
:
assert
arg
is
not
None
assert
getattr
(
arg
,
arg_attr
)
==
arg_value
,
"Existing value {}, new value {}"
.
format
(
getattr
(
arg
,
arg_attr
),
arg_value
)
def
_create_const_fill_op_from_numpy
(
name
,
tensor
,
device_option
=
None
):
assert
type
(
tensor
)
==
np
.
ndarray
kTypeNameMapper
=
{
np
.
dtype
(
"float32"
):
"GivenTensorFill"
,
np
.
dtype
(
"int32"
):
"GivenTensorIntFill"
,
np
.
dtype
(
"int64"
):
"GivenTensorInt64Fill"
,
np
.
dtype
(
"uint8"
):
"GivenTensorStringFill"
,
}
args_dict
=
{}
if
tensor
.
dtype
==
np
.
dtype
(
"uint8"
):
args_dict
.
update
({
"values"
:
[
str
(
tensor
.
data
)],
"shape"
:
[
1
]})
else
:
args_dict
.
update
({
"values"
:
tensor
,
"shape"
:
tensor
.
shape
})
if
device_option
is
not
None
:
args_dict
[
"device_option"
]
=
device_option
return
core
.
CreateOperator
(
kTypeNameMapper
[
tensor
.
dtype
],
[],
[
name
],
**
args_dict
)
def
_create_const_fill_op_from_c2_int8_tensor
(
name
,
int8_tensor
):
assert
type
(
int8_tensor
)
==
workspace
.
Int8Tensor
kTypeNameMapper
=
{
np
.
dtype
(
"int32"
):
"Int8GivenIntTensorFill"
,
np
.
dtype
(
"uint8"
):
"Int8GivenTensorFill"
,
}
tensor
=
int8_tensor
.
data
assert
tensor
.
dtype
in
[
np
.
dtype
(
"uint8"
),
np
.
dtype
(
"int32"
)]
values
=
tensor
.
tobytes
()
if
tensor
.
dtype
==
np
.
dtype
(
"uint8"
)
else
tensor
return
core
.
CreateOperator
(
kTypeNameMapper
[
tensor
.
dtype
],
[],
[
name
],
values
=
values
,
shape
=
tensor
.
shape
,
Y_scale
=
int8_tensor
.
scale
,
Y_zero_point
=
int8_tensor
.
zero_point
,
)
def
create_const_fill_op
(
name
:
str
,
blob
:
Union
[
np
.
ndarray
,
workspace
.
Int8Tensor
],
device_option
:
Optional
[
caffe2_pb2
.
DeviceOption
]
=
None
,
)
->
caffe2_pb2
.
OperatorDef
:
"""
Given a blob object, return the Caffe2 operator that creates this blob
as constant. Currently support NumPy tensor and Caffe2 Int8Tensor.
"""
tensor_type
=
type
(
blob
)
assert
tensor_type
in
[
np
.
ndarray
,
workspace
.
Int8Tensor
,
],
'Error when creating const fill op for "{}", unsupported blob type: {}'
.
format
(
name
,
type
(
blob
)
)
if
tensor_type
==
np
.
ndarray
:
return
_create_const_fill_op_from_numpy
(
name
,
blob
,
device_option
)
elif
tensor_type
==
workspace
.
Int8Tensor
:
assert
device_option
is
None
return
_create_const_fill_op_from_c2_int8_tensor
(
name
,
blob
)
def construct_init_net_from_params(
    params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
) -> caffe2_pb2.NetDef:
    """
    Construct the init_net from params dictionary
    """
    init_net = caffe2_pb2.NetDef()
    device_options = device_options or {}
    for name, blob in params.items():
        if isinstance(blob, str):
            logger.warning(
                (
                    "Blob {} with type {} is not supported in generating init net,"
                    " skipped.".format(name, type(blob))
                )
            )
            continue
        init_net.op.extend(
            [create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
        )
        init_net.external_output.append(name)
    return init_net
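

# Illustrative sketch (not part of the original file): a minimal example of turning
# a dict of NumPy weights into a constant-fill init_net. The blob names used here
# ("conv1_w", "conv1_b") are hypothetical, and this assumes the caffe2 runtime is
# importable in the current environment.
def _example_construct_init_net():
    weights = {
        "conv1_w": np.zeros((8, 3, 3, 3), dtype=np.float32),
        "conv1_b": np.zeros((8,), dtype=np.float32),
    }
    init_net = construct_init_net_from_params(weights)
    # One GivenTensorFill op is created per blob, and each blob name is listed
    # in init_net.external_output.
    assert len(init_net.op) == 2
    return init_net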
def get_producer_map(ssa):
    """
    Return dict from versioned blob to (i, j),
        where i is index of producer op, j is the index of output of that op.
    """
    producer_map = {}
    for i in range(len(ssa)):
        outputs = ssa[i][1]
        for j, outp in enumerate(outputs):
            producer_map[outp] = (i, j)
    return producer_map


def get_consumer_map(ssa):
    """
    Return dict from versioned blob to list of (i, j),
        where i is index of consumer op, j is the index of input of that op.
    """
    consumer_map = collections.defaultdict(list)
    for i in range(len(ssa)):
        inputs = ssa[i][0]
        for j, inp in enumerate(inputs):
            consumer_map[inp].append((i, j))
    return consumer_map
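

# Illustrative sketch (not part of the original file): producer/consumer maps on a
# tiny hand-written SSA list, where each entry is (versioned_inputs, versioned_outputs).
# The blob names "x", "y", "z", "w" are hypothetical.
def _example_producer_consumer_maps():
    ssa = [
        ([("x", 0)], [("y", 0)]),             # op 0: x -> y
        ([("y", 0)], [("z", 0), ("w", 0)]),   # op 1: y -> z, w
    ]
    # ("z", 0) is produced by op 1 as its 0-th output.
    assert get_producer_map(ssa)[("z", 0)] == (1, 0)
    # ("y", 0) is consumed by op 1 as its 0-th input.
    assert get_consumer_map(ssa)[("y", 0)] == [(1, 0)]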
def get_params_from_init_net(
    init_net: caffe2_pb2.NetDef,
) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
    """
    Take the output blobs from init_net by running it.
    Outputs:
        params: dict from blob name to numpy array
        device_options: dict from blob name to the device option of its creating op
    """
    # NOTE: this assumes that the params is determined by producer op with the
    # only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor.
    def _get_device_option(producer_op):
        if producer_op.type == "CopyGPUToCPU":
            return caffe2_pb2.DeviceOption()
        else:
            return producer_op.device_option

    with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
        ws.RunNetOnce(init_net)
        params = {b: fetch_any_blob(b) for b in init_net.external_output}
    ssa, versions = core.get_ssa(init_net)
    producer_map = get_producer_map(ssa)
    device_options = {
        b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
        for b in init_net.external_output
    }
    return params, device_options
def _updater_raise(op, input_types, output_types):
    raise RuntimeError(
        "Failed to apply updater for op {} given input_types {} and"
        " output_types {}".format(op, input_types, output_types)
    )
def _generic_status_identifier(
    predict_net: caffe2_pb2.NetDef,
    status_updater: Callable,
    known_status: Dict[Tuple[str, int], Any],
) -> Dict[Tuple[str, int], Any]:
    """
    Statically infer the status of each blob, the status can be such as device type
        (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
        is versioned blob (Tuple[str, int]) in the format compatible with ssa.
    Inputs:
        predict_net: the caffe2 network
        status_updater: a callable, given an op and the status of its input/output,
            it returns the updated status of input/output. `None` is used for
            representing unknown status.
        known_status: a dict containing known status, used as initialization.
    Outputs:
        A dict mapping from versioned blob to its status
    """
    ssa, versions = core.get_ssa(predict_net)
    versioned_ext_input = [(b, 0) for b in predict_net.external_input]
    versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
    all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])

    allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)

    assert all(k in allowed_vbs for k in known_status)
    assert all(v is not None for v in known_status.values())

    _known_status = copy.deepcopy(known_status)

    def _check_and_update(key, value):
        assert value is not None
        if key in _known_status:
            if not _known_status[key] == value:
                raise RuntimeError(
                    "Conflicting status for {}, existing status {}, new status {}".format(
                        key, _known_status[key], value
                    )
                )
        _known_status[key] = value

    def _update_i(op, ssa_i):
        versioned_inputs = ssa_i[0]
        versioned_outputs = ssa_i[1]

        inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
        outputs_status = [_known_status.get(b, None) for b in versioned_outputs]

        new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)

        for versioned_blob, status in zip(
            versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
        ):
            if status is not None:
                _check_and_update(versioned_blob, status)

    for op, ssa_i in zip(predict_net.op, ssa):
        _update_i(op, ssa_i)
    for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
        _update_i(op, ssa_i)

    # NOTE: This strictly checks that every blob from predict_net is assigned a
    # known status. However sometimes that's impossible (eg. when there are
    # dead-end ops); this constraint may need to be relaxed in such cases.
    for k in all_versioned_blobs:
        if k not in _known_status:
            raise NotImplementedError(
                "Can not infer the status for {}. Currently only support the case where"
                " a single forward and backward pass can identify status for all blobs.".format(k)
            )

    return _known_status
def infer_device_type(
    predict_net: caffe2_pb2.NetDef,
    known_status: Dict[Tuple[str, int], Any],
    device_name_style: str = "caffe2",
) -> Dict[Tuple[str, int], str]:
    """Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob"""

    assert device_name_style in ["caffe2", "pytorch"]
    _CPU_STR = "cpu"
    _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"

    def _copy_cpu_to_gpu_updater(op, input_types, output_types):
        if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
            _updater_raise(op, input_types, output_types)
        return ([_CPU_STR], [_GPU_STR])

    def _copy_gpu_to_cpu_updater(op, input_types, output_types):
        if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
            _updater_raise(op, input_types, output_types)
        return ([_GPU_STR], [_CPU_STR])

    def _other_ops_updater(op, input_types, output_types):
        non_none_types = [x for x in input_types + output_types if x is not None]
        if len(non_none_types) > 0:
            the_type = non_none_types[0]
            if not all(x == the_type for x in non_none_types):
                _updater_raise(op, input_types, output_types)
        else:
            the_type = None
        return ([the_type for _ in op.input], [the_type for _ in op.output])

    def _device_updater(op, *args, **kwargs):
        return {
            "CopyCPUToGPU": _copy_cpu_to_gpu_updater,
            "CopyGPUToCPU": _copy_gpu_to_cpu_updater,
        }.get(op.type, _other_ops_updater)(op, *args, **kwargs)

    return _generic_status_identifier(predict_net, _device_updater, known_status)
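

# Illustrative sketch (not part of the original file): seeding infer_device_type
# with the (versioned) external inputs of a net whose inputs live on CPU, and
# reading back the inferred device of a hypothetical blob named "data".
def _example_infer_device_type(predict_net):
    known_status = {(b, 0): "cpu" for b in predict_net.external_input}
    versioned_device = infer_device_type(
        predict_net, known_status, device_name_style="pytorch"
    )
    # e.g. versioned_device[("data", 0)] would be "cpu" for a CPU-only net.
    return versioned_device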
# ==== torch/utils_caffe2/vis.py ===============================================


def _modify_blob_names(ops, blob_rename_f):
    ret = []

    def _replace_list(blob_list, replaced_list):
        del blob_list[:]
        blob_list.extend(replaced_list)

    for x in ops:
        cur = copy.deepcopy(x)
        _replace_list(cur.input, list(map(blob_rename_f, cur.input)))
        _replace_list(cur.output, list(map(blob_rename_f, cur.output)))
        ret.append(cur)

    return ret


def _rename_blob(name, blob_sizes, blob_ranges):
    def _list_to_str(bsize):
        ret = ", ".join([str(x) for x in bsize])
        ret = "[" + ret + "]"
        return ret

    ret = name
    if blob_sizes is not None and name in blob_sizes:
        ret += "\n" + _list_to_str(blob_sizes[name])
    if blob_ranges is not None and name in blob_ranges:
        ret += "\n" + _list_to_str(blob_ranges[name])

    return ret
# graph_name cannot contain the word 'graph'
def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
    blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
    return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)


def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
    graph = None
    ops = net.op
    if blob_rename_func is not None:
        ops = _modify_blob_names(ops, blob_rename_func)
    if not op_only:
        graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
    else:
        graph = net_drawer.GetPydotGraphMinimal(
            ops, graph_name, rankdir="TB", minimal_dependency=True
        )

    try:
        par_dir = os.path.dirname(file_name)
        if not os.path.exists(par_dir):
            os.makedirs(par_dir)

        format = os.path.splitext(os.path.basename(file_name))[-1]
        if format == ".png":
            graph.write_png(file_name)
        elif format == ".pdf":
            graph.write_pdf(file_name)
        elif format == ".svg":
            graph.write_svg(file_name)
        else:
            print("Incorrect format {}".format(format))
    except Exception as e:
        print("Error when writing graph to image {}".format(e))

    return graph
# ==== torch/utils_toffee/aten_to_caffe2.py ====================================


def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
    """
    For an ONNX exported model, GroupNorm will be represented as an ATen op;
        this performs a drop-in replacement from ATen to the Caffe2 GroupNorm op.
    """
    count = 0
    for op in predict_net.op:
        if op.type == "ATen":
            op_name = get_pb_arg_vals(op, "operator", None)  # returns bytes in py3
            if op_name and op_name.decode() == "group_norm":
                op.arg.remove(get_pb_arg(op, "operator"))

                if get_pb_arg_vali(op, "cudnn_enabled", None):
                    op.arg.remove(get_pb_arg(op, "cudnn_enabled"))

                num_groups = get_pb_arg_vali(op, "num_groups", None)
                if num_groups is not None:
                    op.arg.remove(get_pb_arg(op, "num_groups"))
                    check_set_pb_arg(op, "group", "i", num_groups)

                op.type = "GroupNorm"
                count += 1
    if count > 1:
        logger.info("Replaced {} ATen operators with GroupNormOp".format(count))
# ==== torch/utils_toffee/alias.py =============================================


def alias(x, name, is_backward=False):
    if not torch.onnx.is_in_onnx_export():
        return x
    assert isinstance(x, torch.Tensor)
    return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)


def fuse_alias_placeholder(predict_net, init_net):
    """Remove AliasWithName placeholder and rename the input/output of it"""
    # First we finish all the re-naming
    for i, op in enumerate(predict_net.op):
        if op.type == "AliasWithName":
            assert len(op.input) == 1
            assert len(op.output) == 1
            name = get_pb_arg_vals(op, "name", None).decode()
            is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
            rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
            rename_op_output(predict_net, i, 0, name)

    # Remove AliasWithName, should be very safe since it's a non-op
    new_ops = []
    for op in predict_net.op:
        if op.type != "AliasWithName":
            new_ops.append(op)
        else:
            # safety check
            assert op.input == op.output
            assert op.input[0] == op.arg[0].s.decode()
    del predict_net.op[:]
    predict_net.op.extend(new_ops)
# ==== torch/utils_caffe2/graph_transform.py ===================================


class IllegalGraphTransformError(ValueError):
    """When a graph transform function call can't be executed."""


def _rename_versioned_blob_in_proto(
    proto: caffe2_pb2.NetDef,
    old_name: str,
    new_name: str,
    version: int,
    ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
    start_versions: Dict[str, int],
    end_versions: Dict[str, int],
):
    """In given proto, rename all blobs with matched version"""
    # Operator list
    for op, i_th_ssa in zip(proto.op, ssa):
        versioned_inputs, versioned_outputs = i_th_ssa
        for i in range(len(op.input)):
            if versioned_inputs[i] == (old_name, version):
                op.input[i] = new_name
        for i in range(len(op.output)):
            if versioned_outputs[i] == (old_name, version):
                op.output[i] = new_name
    # external_input
    if start_versions.get(old_name, 0) == version:
        for i in range(len(proto.external_input)):
            if proto.external_input[i] == old_name:
                proto.external_input[i] = new_name
    # external_output
    if end_versions.get(old_name, 0) == version:
        for i in range(len(proto.external_output)):
            if proto.external_output[i] == old_name:
                proto.external_output[i] = new_name
def rename_op_input(
    predict_net: caffe2_pb2.NetDef,
    init_net: caffe2_pb2.NetDef,
    op_id: int,
    input_id: int,
    new_name: str,
    from_producer: bool = False,
):
    """
    Rename the op_id-th operator in predict_net, changing its input_id-th input's
        name to new_name. It also does automatic re-routing and changes
        external_input and init_net if necessary.
    - It requires that the input is only consumed by this op.
    - This function modifies predict_net and init_net in-place.
    - When from_producer is enabled, this also updates other operators that consume
        the same input. Be cautious because this may trigger unintended behavior.
    """
    assert isinstance(predict_net, caffe2_pb2.NetDef)
    assert isinstance(init_net, caffe2_pb2.NetDef)

    init_net_ssa, init_net_versions = core.get_ssa(init_net)
    predict_net_ssa, predict_net_versions = core.get_ssa(
        predict_net, copy.deepcopy(init_net_versions)
    )

    versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
    old_name, version = versioned_inputs[input_id]

    if from_producer:
        producer_map = get_producer_map(predict_net_ssa)
        if not (old_name, version) in producer_map:
            raise NotImplementedError(
                "Can't find producer, the input {} is probably from"
                " init_net, this is not supported yet.".format(old_name)
            )
        producer = producer_map[(old_name, version)]
        rename_op_output(predict_net, producer[0], producer[1], new_name)
        return

    def contain_targets(op_ssa):
        return (old_name, version) in op_ssa[0]

    is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
    if sum(is_consumer) > 1:
        raise IllegalGraphTransformError(
            (
                "Input '{}' of operator(#{}) is consumed by other ops, please use"
                + " rename_op_output on the producer instead. Offending op: \n{}"
            ).format(old_name, op_id, predict_net.op[op_id])
        )

    # update init_net
    _rename_versioned_blob_in_proto(
        init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
    )
    # update predict_net
    _rename_versioned_blob_in_proto(
        predict_net,
        old_name,
        new_name,
        version,
        predict_net_ssa,
        init_net_versions,
        predict_net_versions,
    )
def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
    """
    Rename the op_id-th operator in predict_net, changing its output_id-th output's
        name to new_name. It also does automatic re-routing and changes
        external_output if necessary.
    - It allows multiple consumers of its output.
    - This function modifies predict_net in-place and doesn't need init_net.
    """
    assert isinstance(predict_net, caffe2_pb2.NetDef)

    ssa, blob_versions = core.get_ssa(predict_net)

    versioned_inputs, versioned_outputs = ssa[op_id]
    old_name, version = versioned_outputs[output_id]

    # update predict_net
    _rename_versioned_blob_in_proto(
        predict_net, old_name, new_name, version, ssa, {}, blob_versions
    )
def get_sub_graph_external_input_output(
    predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
    """
    Return the list of external input/output of sub-graph,
    each element is tuple of the name and corresponding version in predict_net.

    external input/output is defined the same way as caffe2 NetDef.
    """
    ssa, versions = core.get_ssa(predict_net)

    all_inputs = []
    all_outputs = []
    for op_id in sub_graph_op_indices:
        all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
        all_outputs += list(ssa[op_id][1])  # ssa output won't repeat

    # for versioned blobs, external inputs are just those blob in all_inputs
    # but not in all_outputs
    ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]

    # external outputs are essentially outputs of this subgraph that are used
    # outside of this sub-graph (including predict_net.external_output)
    all_other_inputs = sum(
        (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
        [(outp, versions[outp]) for outp in predict_net.external_output],
    )
    ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]

    return ext_inputs, ext_outputs
class DiGraph:
    """A DAG representation of caffe2 graph, each vertex is a versioned blob."""

    def __init__(self):
        self.vertices = set()
        self.graph = collections.defaultdict(list)

    def add_edge(self, u, v):
        self.graph[u].append(v)
        self.vertices.add(u)
        self.vertices.add(v)

    # adapted from https://www.geeksforgeeks.org/find-paths-given-source-destination/
    def get_all_paths(self, s, d):
        visited = {k: False for k in self.vertices}
        path = []
        all_paths = []

        def _get_all_paths_util(graph, u, d, visited, path):
            visited[u] = True
            path.append(u)
            if u == d:
                all_paths.append(copy.deepcopy(path))
            else:
                for i in graph[u]:
                    if not visited[i]:
                        _get_all_paths_util(graph, i, d, visited, path)
            path.pop()
            visited[u] = False

        _get_all_paths_util(self.graph, s, d, visited, path)
        return all_paths

    @staticmethod
    def from_ssa(ssa):
        graph = DiGraph()
        for op_id in range(len(ssa)):
            for inp in ssa[op_id][0]:
                for outp in ssa[op_id][1]:
                    graph.add_edge(inp, outp)
        return graph
def _get_dependency_chain(ssa, versioned_target, versioned_source):
    """
    Return the index list of relevant operators to produce the target blob from the
        source blob; if there's no dependency, return an empty list.
    """

    # finding all paths between nodes can be O(N!), thus we only search in the
    # subgraph of ops ranging from the first consumer of the source blob to the
    # producer of the target blob.
    consumer_map = get_consumer_map(ssa)
    producer_map = get_producer_map(ssa)
    start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
    end_op = (
        producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
    )
    sub_graph_ssa = ssa[start_op : end_op + 1]
    if len(sub_graph_ssa) > 30:
        logger.warning(
            "Subgraph between {} and {} is large (from op#{} to op#{}), it"
            " might take non-trivial time to find all paths between them.".format(
                versioned_source, versioned_target, start_op, end_op
            )
        )

    dag = DiGraph.from_ssa(sub_graph_ssa)
    paths = dag.get_all_paths(versioned_source, versioned_target)  # include two ends
    ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
    return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
    """
    Identify the reshape sub-graph in a protobuf.
    The reshape sub-graph is defined as matching the following pattern:

    (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
        └-------------------------------------------> Reshape -> (output_blob)

    Return:
        List of sub-graphs, each sub-graph is represented as a list of indices
        of the relevant ops, [Op_1, Op_2, ..., Op_N, Reshape]
    """

    ssa, _ = core.get_ssa(predict_net)

    ret = []
    for i, op in enumerate(predict_net.op):
        if op.type == "Reshape":
            assert len(op.input) == 2
            input_ssa = ssa[i][0]
            data_source = input_ssa[0]
            shape_source = input_ssa[1]
            op_indices = _get_dependency_chain(ssa, shape_source, data_source)
            ret.append(op_indices + [i])
    return ret
def remove_reshape_for_fc(predict_net, params):
    """
    In PyTorch, nn.Linear has to take a 2D tensor, which often leads to reshaping
        a 4D tensor to 2D by calling .view(). However this (dynamic) reshaping
        doesn't work well with ONNX and Int8 tools, and causes extra
        ops (eg. ExpandDims) that might not be available on mobile.
    Luckily Caffe2 supports 4D tensors for FC, so we can remove those reshapes
        after exporting the ONNX model.
    """
    from caffe2.python import core

    # find all reshape sub-graphs that can be removed, which are currently all
    # Reshape sub-graphs whose output is only consumed by FC.
    # TODO: to make it safer, we may need the actual value to better determine
    # if a Reshape before FC is removable.
    reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
    sub_graphs_to_remove = []
    for reshape_sub_graph in reshape_sub_graphs:
        reshape_op_id = reshape_sub_graph[-1]
        assert predict_net.op[reshape_op_id].type == "Reshape"
        ssa, _ = core.get_ssa(predict_net)
        reshape_output = ssa[reshape_op_id][1][0]
        consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
        if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
            # safety check if the sub-graph is isolated, for this reshape sub-graph,
            # it means it has one non-param external input and one external output.
            ext_inputs, ext_outputs = get_sub_graph_external_input_output(
                predict_net, reshape_sub_graph
            )
            non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
            if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
                sub_graphs_to_remove.append(reshape_sub_graph)

    # perform subgraph removal by:
    # 1: rename the Reshape's output to its input, then the graph can be
    #   seen as an in-place identity, meaning its external input/output are the same.
    # 2: simply remove those ops.
    remove_op_ids = []
    params_to_remove = []
    for sub_graph in sub_graphs_to_remove:
        logger.info(
            "Remove Reshape sub-graph:\n{}".format(
                "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
            )
        )
        reshape_op_id = sub_graph[-1]
        new_reshap_output = predict_net.op[reshape_op_id].input[0]
        rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output)
        ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
        non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
        params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
        assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
        assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
        assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
        remove_op_ids.extend(sub_graph)
        params_to_remove.extend(params_ext_inputs)

    predict_net = copy.deepcopy(predict_net)
    new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
    del predict_net.op[:]
    predict_net.op.extend(new_ops)
    for versioned_params in params_to_remove:
        name = versioned_params[0]
        logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
        del params[name]
        predict_net.external_input.remove(name)

    return predict_net, params
def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
    """
    In-place fuse extra copy ops between cpu/gpu for the following case:
        a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1
                        -CopyBToA> c2 -NextOp2-> d2
    The fused network will look like:
        a -NextOp1-> d1
          -NextOp2-> d2
    """

    _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]

    def _fuse_once(predict_net):
        ssa, blob_versions = core.get_ssa(predict_net)
        consumer_map = get_consumer_map(ssa)
        versioned_external_output = [
            (name, blob_versions[name]) for name in predict_net.external_output
        ]

        for op_id, op in enumerate(predict_net.op):
            if op.type in _COPY_OPS:
                fw_copy_versioned_output = ssa[op_id][1][0]
                consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
                reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]

                is_fusable = (
                    len(consumer_ids) > 0
                    and fw_copy_versioned_output not in versioned_external_output
                    and all(
                        predict_net.op[_op_id].type == reverse_op_type
                        and ssa[_op_id][1][0] not in versioned_external_output
                        for _op_id in consumer_ids
                    )
                )

                if is_fusable:
                    for rv_copy_op_id in consumer_ids:
                        # make each NextOp use "a" directly and remove the Copy ops
                        rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
                        next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
                        predict_net.op[next_op_id].input[inp_id] = op.input[0]
                    # remove CopyOps
                    new_ops = [
                        op
                        for i, op in enumerate(predict_net.op)
                        if i != op_id and i not in consumer_ids
                    ]
                    del predict_net.op[:]
                    predict_net.op.extend(new_ops)
                    return True

        return False

    # _fuse_once returns False if nothing can be fused
    while _fuse_once(predict_net):
        pass
def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
    """remove ops if its output is not used or not in external_output"""
    ssa, versions = core.get_ssa(net_def)
    versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
    consumer_map = get_consumer_map(ssa)
    removed_op_ids = set()

    def _is_dead_end(versioned_blob):
        return not (
            versioned_blob in versioned_external_output
            or (
                len(consumer_map[versioned_blob]) > 0
                and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
            )
        )

    for i, ssa_i in reversed(list(enumerate(ssa))):
        versioned_outputs = ssa_i[1]
        if all(_is_dead_end(outp) for outp in versioned_outputs):
            removed_op_ids.add(i)

    # simply removing those dead-end ops should have no effect on external_output
    new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
    del net_def.op[:]
    net_def.op.extend(new_ops)
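

# Illustrative sketch (not part of the original file): a typical post-export cleanup
# pipeline over a (predict_net, init_net, params) triple produced by the Caffe2 tracing
# export. The ordering of the transforms here is an assumption, not the exporter's
# exact sequence; all calls below use the functions defined in this module.
def _example_cleanup_pipeline(predict_net, init_net, params):
    fuse_alias_placeholder(predict_net, init_net)
    fuse_copy_between_cpu_and_gpu(predict_net)
    remove_dead_end_ops(init_net)
    predict_net, params = remove_reshape_for_fc(predict_net, params)
    return predict_net, init_net, params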
data_generation/grit/third_party/CenterNet2/detectron2/export/torchscript.py
# Copyright (c) Facebook, Inc. and its affiliates.
import os
import torch

from detectron2.utils.file_io import PathManager

from .torchscript_patch import freeze_training_mode, patch_instances

__all__ = ["scripting_with_instances", "dump_torchscript_IR"]


def scripting_with_instances(model, fields):
    """
    Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since
    attributes of :class:`Instances` are "dynamically" added in eager mode, it is difficult
    for scripting to support it out of the box. This function is made to support scripting
    a model that uses :class:`Instances`. It does the following:

    1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``,
       but with all attributes being "static".
       The attributes need to be statically declared in the ``fields`` argument.
    2. Register ``new_Instances``, and force the scripting compiler to
       use it when trying to compile ``Instances``.

    After this function, the process will be reverted. Users should be able to script another
    model using different fields.

    Example:
        Assume that ``Instances`` in the model consist of two attributes named
        ``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and
        :class:`Tensor` respectively during inference. You can call this function like:
        ::
            fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
            torchscript_model = scripting_with_instances(model, fields)

    Note:
        It only supports models in evaluation mode.

    Args:
        model (nn.Module): The input model to be exported by scripting.
        fields (Dict[str, type]): Attribute names and corresponding type that
            ``Instances`` will use in the model. Note that all attributes used in ``Instances``
            need to be added, regardless of whether they are inputs/outputs of the model.
            Data types not defined in detectron2 are not supported for now.

    Returns:
        torch.jit.ScriptModule: the model in torchscript format
    """
    assert (
        not model.training
    ), "Currently we only support exporting models in evaluation mode to torchscript"

    with freeze_training_mode(model), patch_instances(fields):
        scripted_model = torch.jit.script(model)
        return scripted_model


# alias for old name
export_torchscript_with_instances = scripting_with_instances


def dump_torchscript_IR(model, dir):
    """
    Dump IR of a TracedModule/ScriptModule/Function in various formats (code, graph,
    inlined graph). Useful for debugging.

    Args:
        model (TracedModule/ScriptModule/ScriptFunction): traced or scripted module
        dir (str): output directory to dump files.
    """
    dir = os.path.expanduser(dir)
    PathManager.mkdirs(dir)

    def _get_script_mod(mod):
        if isinstance(mod, torch.jit.TracedModule):
            return mod._actual_script_module
        return mod

    # Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code
    with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f:

        def get_code(mod):
            # Try a few ways to get code using private attributes.
            try:
                # This contains more information than just `mod.code`
                return _get_script_mod(mod)._c.code
            except AttributeError:
                pass
            try:
                return mod.code
            except AttributeError:
                return None

        def dump_code(prefix, mod):
            code = get_code(mod)
            name = prefix or "root model"
            if code is None:
                f.write(f"Could not find code for {name} (type={mod.original_name})\n")
                f.write("\n")
            else:
                f.write(f"\nCode for {name}, type={mod.original_name}:\n")
                f.write(code)
                f.write("\n")
                f.write("-" * 80)

            for name, m in mod.named_children():
                dump_code(prefix + "." + name, m)

        if isinstance(model, torch.jit.ScriptFunction):
            f.write(get_code(model))
        else:
            dump_code("", model)

    def _get_graph(model):
        try:
            # Recursively dump IR of all modules
            return _get_script_mod(model)._c.dump_to_str(True, False, False)
        except AttributeError:
            return model.graph.str()

    with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f:
        f.write(_get_graph(model))

    # Dump IR of the entire graph (all submodules inlined)
    with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f:
        f.write(str(model.inlined_graph))

    if not isinstance(model, torch.jit.ScriptFunction):
        # Dump the model structure in pytorch style
        with PathManager.open(os.path.join(dir, "model.txt"), "w") as f:
            f.write(str(model))
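

# Illustrative sketch (not part of the original file): scripting an eval-mode
# detectron2 model whose Instances carry "proposal_boxes" and "objectness_logits"
# (a hypothetical field set), then dumping its IR for inspection.
def _example_export(model, output_dir="./ts_dump"):
    from detectron2.structures import Boxes

    fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
    ts_model = scripting_with_instances(model.eval(), fields)
    dump_torchscript_IR(ts_model, output_dir)
    return ts_model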
data_generation/grit/third_party/CenterNet2/detectron2/export/torchscript_patch.py
# Copyright (c) Facebook, Inc. and its affiliates.
import
os
import
sys
import
tempfile
from
contextlib
import
ExitStack
,
contextmanager
from
copy
import
deepcopy
from
unittest
import
mock
import
torch
from
torch
import
nn
# need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964
import
detectron2
# noqa F401
from
detectron2.structures
import
Boxes
,
Instances
from
detectron2.utils.env
import
_import_file
_counter
=
0
def
_clear_jit_cache
():
from
torch.jit._recursive
import
concrete_type_store
from
torch.jit._state
import
_jit_caching_layer
concrete_type_store
.
type_store
.
clear
()
# for modules
_jit_caching_layer
.
clear
()
# for free functions
def
_add_instances_conversion_methods
(
newInstances
):
"""
Add from_instances methods to the scripted Instances class.
"""
cls_name
=
newInstances
.
__name__
@
torch
.
jit
.
unused
def
from_instances
(
instances
:
Instances
):
"""
Create scripted Instances from original Instances
"""
fields
=
instances
.
get_fields
()
image_size
=
instances
.
image_size
ret
=
newInstances
(
image_size
)
for
name
,
val
in
fields
.
items
():
assert
hasattr
(
ret
,
f
"_
{
name
}
"
),
f
"No attribute named
{
name
}
in
{
cls_name
}
"
setattr
(
ret
,
name
,
deepcopy
(
val
))
return
ret
newInstances
.
from_instances
=
from_instances
@
contextmanager
def
patch_instances
(
fields
):
"""
A contextmanager, under which the Instances class in detectron2 is replaced
by a statically-typed scriptable class, defined by `fields`.
See more in `scripting_with_instances`.
"""
with
tempfile
.
TemporaryDirectory
(
prefix
=
"detectron2"
)
as
dir
,
tempfile
.
NamedTemporaryFile
(
mode
=
"w"
,
encoding
=
"utf-8"
,
suffix
=
".py"
,
dir
=
dir
,
delete
=
False
)
as
f
:
try
:
# Objects that use Instances should not reuse previously-compiled
# results in cache, because `Instances` could be a new class each time.
_clear_jit_cache
()
cls_name
,
s
=
_gen_instance_module
(
fields
)
f
.
write
(
s
)
f
.
flush
()
f
.
close
()
module
=
_import
(
f
.
name
)
new_instances
=
getattr
(
module
,
cls_name
)
_
=
torch
.
jit
.
script
(
new_instances
)
# let torchscript think Instances was scripted already
Instances
.
__torch_script_class__
=
True
# let torchscript find new_instances when looking for the jit type of Instances
Instances
.
_jit_override_qualname
=
torch
.
_jit_internal
.
_qualified_name
(
new_instances
)
_add_instances_conversion_methods
(
new_instances
)
yield
new_instances
finally
:
try
:
del
Instances
.
__torch_script_class__
del
Instances
.
_jit_override_qualname
except
AttributeError
:
pass
sys
.
modules
.
pop
(
module
.
__name__
)
def
_gen_instance_class
(
fields
):
"""
Args:
fields (dict[name: type])
"""
class
_FieldType
:
def
__init__
(
self
,
name
,
type_
):
assert
isinstance
(
name
,
str
),
f
"Field name must be str, got
{
name
}
"
self
.
name
=
name
self
.
type_
=
type_
self
.
annotation
=
f
"
{
type_
.
__module__
}
.
{
type_
.
__name__
}
"
fields
=
[
_FieldType
(
k
,
v
)
for
k
,
v
in
fields
.
items
()]
def
indent
(
level
,
s
):
return
" "
*
4
*
level
+
s
lines
=
[]
global
_counter
_counter
+=
1
cls_name
=
"ScriptedInstances{}"
.
format
(
_counter
)
field_names
=
tuple
(
x
.
name
for
x
in
fields
)
extra_args
=
", "
.
join
([
f
"
{
f
.
name
}
: Optional[
{
f
.
annotation
}
] = None"
for
f
in
fields
])
lines
.
append
(
f
"""
class
{
cls_name
}
:
def __init__(self, image_size: Tuple[int, int],
{
extra_args
}
):
self.image_size = image_size
self._field_names =
{
field_names
}
"""
)
for
f
in
fields
:
lines
.
append
(
indent
(
2
,
f
"self._
{
f
.
name
}
= torch.jit.annotate(Optional[
{
f
.
annotation
}
],
{
f
.
name
}
)"
)
)
for
f
in
fields
:
lines
.
append
(
f
"""
@property
def
{
f
.
name
}
(self) ->
{
f
.
annotation
}
:
# has to use a local for type refinement
# https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement
t = self._
{
f
.
name
}
assert t is not None, "
{
f
.
name
}
is None and cannot be accessed!"
return t
@
{
f
.
name
}
.setter
def
{
f
.
name
}
(self, value:
{
f
.
annotation
}
) -> None:
self._
{
f
.
name
}
= value
"""
)
# support method `__len__`
lines
.
append
(
"""
def __len__(self) -> int:
"""
)
for
f
in
fields
:
lines
.
append
(
f
"""
t = self._
{
f
.
name
}
if t is not None:
return len(t)
"""
)
lines
.
append
(
"""
raise NotImplementedError("Empty Instances does not support __len__!")
"""
)
# support method `has`
lines
.
append
(
"""
def has(self, name: str) -> bool:
"""
)
for
f
in
fields
:
lines
.
append
(
f
"""
if name == "
{
f
.
name
}
":
return self._
{
f
.
name
}
is not None
"""
)
lines
.
append
(
"""
return False
"""
)
# support method `to`
none_args
=
", None"
*
len
(
fields
)
lines
.
append
(
f
"""
def to(self, device: torch.device) -> "
{
cls_name
}
":
ret =
{
cls_name
}
(self.image_size
{
none_args
}
)
"""
)
for
f
in
fields
:
if
hasattr
(
f
.
type_
,
"to"
):
lines
.
append
(
f
"""
t = self._
{
f
.
name
}
if t is not None:
ret._
{
f
.
name
}
= t.to(device)
"""
)
else
:
# For now, ignore fields that cannot be moved to devices.
# Maybe can support other tensor-like classes (e.g. __torch_function__)
pass
lines
.
append
(
"""
return ret
"""
)
# support method `getitem`
none_args
=
", None"
*
len
(
fields
)
lines
.
append
(
f
"""
def __getitem__(self, item) -> "
{
cls_name
}
":
ret =
{
cls_name
}
(self.image_size
{
none_args
}
)
"""
)
for
f
in
fields
:
lines
.
append
(
f
"""
t = self._
{
f
.
name
}
if t is not None:
ret._
{
f
.
name
}
= t[item]
"""
)
lines
.
append
(
"""
return ret
"""
)
# support method `cat`
# this version does not contain checks that all instances have same size and fields
none_args
=
", None"
*
len
(
fields
)
lines
.
append
(
f
"""
def cat(self, instances: List["
{
cls_name
}
"]) -> "
{
cls_name
}
":
ret =
{
cls_name
}
(self.image_size
{
none_args
}
)
"""
)
for
f
in
fields
:
lines
.
append
(
f
"""
t = self._
{
f
.
name
}
if t is not None:
values: List[
{
f
.
annotation
}
] = [x.
{
f
.
name
}
for x in instances]
if torch.jit.isinstance(t, torch.Tensor):
ret._
{
f
.
name
}
= torch.cat(values, dim=0)
else:
ret._
{
f
.
name
}
= t.cat(values)
"""
)
lines
.
append
(
"""
return ret"""
)
# support method `get_fields()`
lines
.
append
(
"""
def get_fields(self) -> Dict[str, Tensor]:
ret = {}
"""
)
for
f
in
fields
:
if
f
.
type_
==
Boxes
:
stmt
=
"t.tensor"
elif
f
.
type_
==
torch
.
Tensor
:
stmt
=
"t"
else
:
stmt
=
f
'assert False, "unsupported type
{
str
(
f
.
type_
)
}
"'
lines
.
append
(
f
"""
t = self._
{
f
.
name
}
if t is not None:
ret["
{
f
.
name
}
"] =
{
stmt
}
"""
)
lines
.
append
(
"""
return ret"""
)
return
cls_name
,
os
.
linesep
.
join
(
lines
)
def
_gen_instance_module
(
fields
):
# TODO: find a more automatic way to enable import of other classes
s
=
"""
from copy import deepcopy
import torch
from torch import Tensor
import typing
from typing import *
import detectron2
from detectron2.structures import Boxes, Instances
"""
cls_name
,
cls_def
=
_gen_instance_class
(
fields
)
s
+=
cls_def
return
cls_name
,
s
def
_import
(
path
):
return
_import_file
(
"{}{}"
.
format
(
sys
.
modules
[
__name__
].
__name__
,
_counter
),
path
,
make_importable
=
True
)
@
contextmanager
def
patch_builtin_len
(
modules
=
()):
"""
Patch the builtin len() function of a few detectron2 modules
to use __len__ instead, because __len__ does not convert values to
integers and therefore is friendly to tracing.
Args:
modules (list[stsr]): names of extra modules to patch len(), in
addition to those in detectron2.
"""
def
_new_len
(
obj
):
return
obj
.
__len__
()
with
ExitStack
()
as
stack
:
MODULES
=
[
"detectron2.modeling.roi_heads.fast_rcnn"
,
"detectron2.modeling.roi_heads.mask_head"
,
"detectron2.modeling.roi_heads.keypoint_head"
,
]
+
list
(
modules
)
ctxs
=
[
stack
.
enter_context
(
mock
.
patch
(
mod
+
".len"
))
for
mod
in
MODULES
]
for
m
in
ctxs
:
m
.
side_effect
=
_new_len
yield
def
patch_nonscriptable_classes
():
"""
Apply patches on a few nonscriptable detectron2 classes.
Should not have side-effects on eager usage.
"""
# __prepare_scriptable__ can also be added to models for easier maintenance.
# But it complicates the clean model code.
from
detectron2.modeling.backbone
import
ResNet
,
FPN
# Due to https://github.com/pytorch/pytorch/issues/36061,
# we change backbone to use ModuleList for scripting.
# (note: this changes param names in state_dict)
def
prepare_resnet
(
self
):
ret
=
deepcopy
(
self
)
ret
.
stages
=
nn
.
ModuleList
(
ret
.
stages
)
for
k
in
self
.
stage_names
:
delattr
(
ret
,
k
)
return
ret
ResNet
.
__prepare_scriptable__
=
prepare_resnet
def
prepare_fpn
(
self
):
ret
=
deepcopy
(
self
)
ret
.
lateral_convs
=
nn
.
ModuleList
(
ret
.
lateral_convs
)
ret
.
output_convs
=
nn
.
ModuleList
(
ret
.
output_convs
)
for
name
,
_
in
self
.
named_children
():
if
name
.
startswith
(
"fpn_"
):
delattr
(
ret
,
name
)
return
ret
FPN
.
__prepare_scriptable__
=
prepare_fpn
# Annotate some attributes to be constants for the purpose of scripting,
# even though they are not constants in eager mode.
from
detectron2.modeling.roi_heads
import
StandardROIHeads
if
hasattr
(
StandardROIHeads
,
"__annotations__"
):
# copy first to avoid editing annotations of base class
StandardROIHeads
.
__annotations__
=
deepcopy
(
StandardROIHeads
.
__annotations__
)
StandardROIHeads
.
__annotations__
[
"mask_on"
]
=
torch
.
jit
.
Final
[
bool
]
StandardROIHeads
.
__annotations__
[
"keypoint_on"
]
=
torch
.
jit
.
Final
[
bool
]
# These patches are not supposed to have side-effects.
patch_nonscriptable_classes
()
@
contextmanager
def
freeze_training_mode
(
model
):
"""
A context manager that annotates the "training" attribute of every submodule
to constant, so that the training codepath in these modules can be
meta-compiled away. Upon exiting, the annotations are reverted.
"""
classes
=
{
type
(
x
)
for
x
in
model
.
modules
()}
# __constants__ is the old way to annotate constants and not compatible
# with __annotations__ .
classes
=
{
x
for
x
in
classes
if
not
hasattr
(
x
,
"__constants__"
)}
for
cls
in
classes
:
cls
.
__annotations__
[
"training"
]
=
torch
.
jit
.
Final
[
bool
]
yield
for
cls
in
classes
:
cls
.
__annotations__
[
"training"
]
=
bool
data_generation/grit/third_party/CenterNet2/detectron2/layers/__init__.py
# Copyright (c) Facebook, Inc. and its affiliates.
from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList
from .deform_conv import DeformConv, ModulatedDeformConv
from .mask_ops import paste_masks_in_image
from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
from .roi_align import ROIAlign, roi_align
from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
from .shape_spec import ShapeSpec
from .wrappers import (
    BatchNorm2d,
    Conv2d,
    ConvTranspose2d,
    cat,
    interpolate,
    Linear,
    nonzero_tuple,
    cross_entropy,
    shapes_to_tensor,
)
from .blocks import CNNBlockBase, DepthwiseSeparableConv2d
from .aspp import ASPP
from .losses import ciou_loss, diou_loss

__all__ = [k for k in globals().keys() if not k.startswith("_")]
data_generation/grit/third_party/CenterNet2/detectron2/layers/aspp.py
# Copyright (c) Facebook, Inc. and its affiliates.
from
copy
import
deepcopy
import
fvcore.nn.weight_init
as
weight_init
import
torch
from
torch
import
nn
from
torch.nn
import
functional
as
F
from
.batch_norm
import
get_norm
from
.blocks
import
DepthwiseSeparableConv2d
from
.wrappers
import
Conv2d
class
ASPP
(
nn
.
Module
):
"""
Atrous Spatial Pyramid Pooling (ASPP).
"""
def
__init__
(
self
,
in_channels
,
out_channels
,
dilations
,
*
,
norm
,
activation
,
pool_kernel_size
=
None
,
dropout
:
float
=
0.0
,
use_depthwise_separable_conv
=
False
,
):
"""
Args:
in_channels (int): number of input channels for ASPP.
out_channels (int): number of output channels.
dilations (list): a list of 3 dilations in ASPP.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format. norm is
applied to all conv layers except the conv following
global average pooling.
activation (callable): activation function.
pool_kernel_size (tuple, list): the average pooling size (kh, kw)
for image pooling layer in ASPP. If set to None, it always
performs global average pooling. If not None, it must be
divisible by the shape of inputs in forward(). It is recommended
to use a fixed input feature size in training, and set this
option to match this size, so that it performs global average
pooling in training, and the size of the pooling window stays
consistent in inference.
dropout (float): apply dropout on the output of ASPP. It is used in
the official DeepLab implementation with a rate of 0.1:
https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa
use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
"""
super
(
ASPP
,
self
).
__init__
()
assert
len
(
dilations
)
==
3
,
"ASPP expects 3 dilations, got {}"
.
format
(
len
(
dilations
))
self
.
pool_kernel_size
=
pool_kernel_size
self
.
dropout
=
dropout
use_bias
=
norm
==
""
self
.
convs
=
nn
.
ModuleList
()
# conv 1x1
self
.
convs
.
append
(
Conv2d
(
in_channels
,
out_channels
,
kernel_size
=
1
,
bias
=
use_bias
,
norm
=
get_norm
(
norm
,
out_channels
),
activation
=
deepcopy
(
activation
),
)
)
weight_init
.
c2_xavier_fill
(
self
.
convs
[
-
1
])
# atrous convs
for
dilation
in
dilations
:
if
use_depthwise_separable_conv
:
self
.
convs
.
append
(
DepthwiseSeparableConv2d
(
in_channels
,
out_channels
,
kernel_size
=
3
,
padding
=
dilation
,
dilation
=
dilation
,
norm1
=
norm
,
activation1
=
deepcopy
(
activation
),
norm2
=
norm
,
activation2
=
deepcopy
(
activation
),
)
)
else
:
self
.
convs
.
append
(
Conv2d
(
in_channels
,
out_channels
,
kernel_size
=
3
,
padding
=
dilation
,
dilation
=
dilation
,
bias
=
use_bias
,
norm
=
get_norm
(
norm
,
out_channels
),
activation
=
deepcopy
(
activation
),
)
)
weight_init
.
c2_xavier_fill
(
self
.
convs
[
-
1
])
# image pooling
# We do not add BatchNorm because the spatial resolution is 1x1,
# the original TF implementation has BatchNorm.
if
pool_kernel_size
is
None
:
image_pooling
=
nn
.
Sequential
(
nn
.
AdaptiveAvgPool2d
(
1
),
Conv2d
(
in_channels
,
out_channels
,
1
,
bias
=
True
,
activation
=
deepcopy
(
activation
)),
)
else
:
image_pooling
=
nn
.
Sequential
(
nn
.
AvgPool2d
(
kernel_size
=
pool_kernel_size
,
stride
=
1
),
Conv2d
(
in_channels
,
out_channels
,
1
,
bias
=
True
,
activation
=
deepcopy
(
activation
)),
)
weight_init
.
c2_xavier_fill
(
image_pooling
[
1
])
self
.
convs
.
append
(
image_pooling
)
self
.
project
=
Conv2d
(
5
*
out_channels
,
out_channels
,
kernel_size
=
1
,
bias
=
use_bias
,
norm
=
get_norm
(
norm
,
out_channels
),
activation
=
deepcopy
(
activation
),
)
weight_init
.
c2_xavier_fill
(
self
.
project
)
def
forward
(
self
,
x
):
size
=
x
.
shape
[
-
2
:]
if
self
.
pool_kernel_size
is
not
None
:
if
size
[
0
]
%
self
.
pool_kernel_size
[
0
]
or
size
[
1
]
%
self
.
pool_kernel_size
[
1
]:
raise
ValueError
(
"`pool_kernel_size` must be divisible by the shape of inputs. "
"Input size: {} `pool_kernel_size`: {}"
.
format
(
size
,
self
.
pool_kernel_size
)
)
res
=
[]
for
conv
in
self
.
convs
:
res
.
append
(
conv
(
x
))
res
[
-
1
]
=
F
.
interpolate
(
res
[
-
1
],
size
=
size
,
mode
=
"bilinear"
,
align_corners
=
False
)
res
=
torch
.
cat
(
res
,
dim
=
1
)
res
=
self
.
project
(
res
)
res
=
F
.
dropout
(
res
,
self
.
dropout
,
training
=
self
.
training
)
if
self
.
dropout
>
0
else
res
return
res
data_generation/grit/third_party/CenterNet2/detectron2/layers/batch_norm.py
# Copyright (c) Facebook, Inc. and its affiliates.
import
torch
import
torch.distributed
as
dist
from
fvcore.nn.distributed
import
differentiable_all_reduce
from
torch
import
nn
from
torch.nn
import
functional
as
F
from
detectron2.utils
import
comm
,
env
from
.wrappers
import
BatchNorm2d
class
FrozenBatchNorm2d
(
nn
.
Module
):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
It contains non-trainable buffers called
"weight" and "bias", "running_mean", "running_var",
initialized to perform identity transformation.
The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
which are computed from the original four parameters of BN.
The affine transform `x * weight + bias` will perform the equivalent
computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
When loading a backbone model from Caffe2, "running_mean" and "running_var"
will be left unchanged as identity transformation.
Other pre-trained backbone models may contain all 4 parameters.
The forward is implemented by `F.batch_norm(..., training=False)`.
"""
_version
=
3
def
__init__
(
self
,
num_features
,
eps
=
1e-5
):
super
().
__init__
()
self
.
num_features
=
num_features
self
.
eps
=
eps
self
.
register_buffer
(
"weight"
,
torch
.
ones
(
num_features
))
self
.
register_buffer
(
"bias"
,
torch
.
zeros
(
num_features
))
self
.
register_buffer
(
"running_mean"
,
torch
.
zeros
(
num_features
))
self
.
register_buffer
(
"running_var"
,
torch
.
ones
(
num_features
)
-
eps
)
def
forward
(
self
,
x
):
if
x
.
requires_grad
:
# When gradients are needed, F.batch_norm will use extra memory
# because its backward op computes gradients for weight/bias as well.
scale
=
self
.
weight
*
(
self
.
running_var
+
self
.
eps
).
rsqrt
()
bias
=
self
.
bias
-
self
.
running_mean
*
scale
scale
=
scale
.
reshape
(
1
,
-
1
,
1
,
1
)
bias
=
bias
.
reshape
(
1
,
-
1
,
1
,
1
)
out_dtype
=
x
.
dtype
# may be half
return
x
*
scale
.
to
(
out_dtype
)
+
bias
.
to
(
out_dtype
)
else
:
# When gradients are not needed, F.batch_norm is a single fused op
# and provide more optimization opportunities.
return
F
.
batch_norm
(
x
,
self
.
running_mean
,
self
.
running_var
,
self
.
weight
,
self
.
bias
,
training
=
False
,
eps
=
self
.
eps
,
)
def
_load_from_state_dict
(
self
,
state_dict
,
prefix
,
local_metadata
,
strict
,
missing_keys
,
unexpected_keys
,
error_msgs
):
version
=
local_metadata
.
get
(
"version"
,
None
)
if
version
is
None
or
version
<
2
:
# No running_mean/var in early versions
# This will silent the warnings
if
prefix
+
"running_mean"
not
in
state_dict
:
state_dict
[
prefix
+
"running_mean"
]
=
torch
.
zeros_like
(
self
.
running_mean
)
if
prefix
+
"running_var"
not
in
state_dict
:
state_dict
[
prefix
+
"running_var"
]
=
torch
.
ones_like
(
self
.
running_var
)
super
().
_load_from_state_dict
(
state_dict
,
prefix
,
local_metadata
,
strict
,
missing_keys
,
unexpected_keys
,
error_msgs
)
def
__repr__
(
self
):
return
"FrozenBatchNorm2d(num_features={}, eps={})"
.
format
(
self
.
num_features
,
self
.
eps
)
@
classmethod
def
convert_frozen_batchnorm
(
cls
,
module
):
"""
Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
Args:
module (torch.nn.Module):
Returns:
If module is BatchNorm/SyncBatchNorm, returns a new module.
Otherwise, in-place convert module and return it.
Similar to convert_sync_batchnorm in
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
"""
bn_module
=
nn
.
modules
.
batchnorm
bn_module
=
(
bn_module
.
BatchNorm2d
,
bn_module
.
SyncBatchNorm
)
res
=
module
if
isinstance
(
module
,
bn_module
):
res
=
cls
(
module
.
num_features
)
if
module
.
affine
:
res
.
weight
.
data
=
module
.
weight
.
data
.
clone
().
detach
()
res
.
bias
.
data
=
module
.
bias
.
data
.
clone
().
detach
()
res
.
running_mean
.
data
=
module
.
running_mean
.
data
res
.
running_var
.
data
=
module
.
running_var
.
data
res
.
eps
=
module
.
eps
else
:
for
name
,
child
in
module
.
named_children
():
new_child
=
cls
.
convert_frozen_batchnorm
(
child
)
if
new_child
is
not
child
:
res
.
add_module
(
name
,
new_child
)
return
res
def
get_norm
(
norm
,
out_channels
):
"""
Args:
norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
or a callable that takes a channel number and returns
the normalization layer as a nn.Module.
Returns:
nn.Module or None: the normalization layer
"""
if
norm
is
None
:
return
None
if
isinstance
(
norm
,
str
):
if
len
(
norm
)
==
0
:
return
None
norm
=
{
"BN"
:
BatchNorm2d
,
# Fixed in https://github.com/pytorch/pytorch/pull/36382
"SyncBN"
:
NaiveSyncBatchNorm
if
env
.
TORCH_VERSION
<=
(
1
,
5
)
else
nn
.
SyncBatchNorm
,
"FrozenBN"
:
FrozenBatchNorm2d
,
"GN"
:
lambda
channels
:
nn
.
GroupNorm
(
32
,
channels
),
# for debugging:
"nnSyncBN"
:
nn
.
SyncBatchNorm
,
"naiveSyncBN"
:
NaiveSyncBatchNorm
,
# expose stats_mode N as an option to caller, required for zero-len inputs
"naiveSyncBN_N"
:
lambda
channels
:
NaiveSyncBatchNorm
(
channels
,
stats_mode
=
"N"
),
}[
norm
]
return
norm
(
out_channels
)
class
NaiveSyncBatchNorm
(
BatchNorm2d
):
"""
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferrable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def
__init__
(
self
,
*
args
,
stats_mode
=
""
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
assert
stats_mode
in
[
""
,
"N"
]
self
.
_stats_mode
=
stats_mode
def
forward
(
self
,
input
):
if
comm
.
get_world_size
()
==
1
or
not
self
.
training
:
return
super
().
forward
(
input
)
B
,
C
=
input
.
shape
[
0
],
input
.
shape
[
1
]
half_input
=
input
.
dtype
==
torch
.
float16
if
half_input
:
# fp16 does not have good enough numerics for the reduction here
input
=
input
.
float
()
mean
=
torch
.
mean
(
input
,
dim
=
[
0
,
2
,
3
])
meansqr
=
torch
.
mean
(
input
*
input
,
dim
=
[
0
,
2
,
3
])
if
self
.
_stats_mode
==
""
:
assert
B
>
0
,
'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec
=
torch
.
cat
([
mean
,
meansqr
],
dim
=
0
)
vec
=
differentiable_all_reduce
(
vec
)
*
(
1.0
/
dist
.
get_world_size
())
mean
,
meansqr
=
torch
.
split
(
vec
,
C
)
momentum
=
self
.
momentum
else
:
if
B
==
0
:
vec
=
torch
.
zeros
([
2
*
C
+
1
],
device
=
mean
.
device
,
dtype
=
mean
.
dtype
)
vec
=
vec
+
input
.
sum
()
# make sure there is gradient w.r.t input
else
:
vec
=
torch
.
cat
(
[
mean
,
meansqr
,
torch
.
ones
([
1
],
device
=
mean
.
device
,
dtype
=
mean
.
dtype
)],
dim
=
0
)
vec
=
differentiable_all_reduce
(
vec
*
B
)
total_batch
=
vec
[
-
1
].
detach
()
momentum
=
total_batch
.
clamp
(
max
=
1
)
*
self
.
momentum
# no update if total_batch is 0
mean
,
meansqr
,
_
=
torch
.
split
(
vec
/
total_batch
.
clamp
(
min
=
1
),
C
)
# avoid div-by-zero
var
=
meansqr
-
mean
*
mean
invstd
=
torch
.
rsqrt
(
var
+
self
.
eps
)
scale
=
self
.
weight
*
invstd
bias
=
self
.
bias
-
mean
*
scale
scale
=
scale
.
reshape
(
1
,
-
1
,
1
,
1
)
bias
=
bias
.
reshape
(
1
,
-
1
,
1
,
1
)
self
.
running_mean
+=
momentum
*
(
mean
.
detach
()
-
self
.
running_mean
)
self
.
running_var
+=
momentum
*
(
var
.
detach
()
-
self
.
running_var
)
ret
=
input
*
scale
+
bias
if
half_input
:
ret
=
ret
.
half
()
return
ret
class
CycleBatchNormList
(
nn
.
ModuleList
):
"""
Implement domain-specific BatchNorm by cycling.
When a BatchNorm layer is used for multiple input domains or input
features, it might need to maintain a separate test-time statistics
for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
This module implements it by using N separate BN layers
and it cycles through them every time a forward() is called.
NOTE: The caller of this module MUST guarantee to always call
this module by multiple of N times. Otherwise its test-time statistics
will be incorrect.
"""
def
__init__
(
self
,
length
:
int
,
bn_class
=
nn
.
BatchNorm2d
,
**
kwargs
):
"""
Args:
length: number of BatchNorm layers to cycle.
bn_class: the BatchNorm class to use
kwargs: arguments of the BatchNorm class, such as num_features.
"""
self
.
_affine
=
kwargs
.
pop
(
"affine"
,
True
)
super
().
__init__
([
bn_class
(
**
kwargs
,
affine
=
False
)
for
k
in
range
(
length
)])
if
self
.
_affine
:
# shared affine, domain-specific BN
channels
=
self
[
0
].
num_features
self
.
weight
=
nn
.
Parameter
(
torch
.
ones
(
channels
))
self
.
bias
=
nn
.
Parameter
(
torch
.
zeros
(
channels
))
self
.
_pos
=
0
def
forward
(
self
,
x
):
ret
=
self
[
self
.
_pos
](
x
)
self
.
_pos
=
(
self
.
_pos
+
1
)
%
len
(
self
)
if
self
.
_affine
:
w
=
self
.
weight
.
reshape
(
1
,
-
1
,
1
,
1
)
b
=
self
.
bias
.
reshape
(
1
,
-
1
,
1
,
1
)
return
ret
*
w
+
b
else
:
return
ret
def
extra_repr
(
self
):
return
f
"affine=
{
self
.
_affine
}
"
data_generation/grit/third_party/CenterNet2/detectron2/layers/blocks.py
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import fvcore.nn.weight_init as weight_init
from torch import nn

from .batch_norm import FrozenBatchNorm2d, get_norm
from .wrappers import Conv2d


"""
CNN building blocks.
"""


class CNNBlockBase(nn.Module):
    """
    A CNN block is assumed to have input channels, output channels and a stride.
    The input and output of `forward()` method must be NCHW tensors.
    The method can perform arbitrary computation but must match the given
    channels and stride specification.

    Attribute:
        in_channels (int):
        out_channels (int):
        stride (int):
    """

    def __init__(self, in_channels, out_channels, stride):
        """
        The `__init__` method of any subclass should also contain these arguments.

        Args:
            in_channels (int):
            out_channels (int):
            stride (int):
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def freeze(self):
        """
        Make this block not trainable.
        This method sets all parameters to `requires_grad=False`,
        and convert all BatchNorm layers to FrozenBatchNorm

        Returns:
            the block itself
        """
        for p in self.parameters():
            p.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self


class DepthwiseSeparableConv2d(nn.Module):
    """
    A kxk depthwise convolution + a 1x1 convolution.

    In :paper:`xception`, norm & activation are applied on the second conv.
    :paper:`mobilenet` uses norm & activation on both convs.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        padding=1,
        dilation=1,
        *,
        norm1=None,
        activation1=None,
        norm2=None,
        activation2=None,
    ):
        """
        Args:
            norm1, norm2 (str or callable): normalization for the two conv layers.
            activation1, activation2 (callable(Tensor) -> Tensor): activation
                function for the two conv layers.
        """
        super().__init__()
        self.depthwise = Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=not norm1,
            norm=get_norm(norm1, in_channels),
            activation=activation1,
        )
        self.pointwise = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            bias=not norm2,
            norm=get_norm(norm2, out_channels),
            activation=activation2,
        )

        # default initialization
        weight_init.c2_msra_fill(self.depthwise)
        weight_init.c2_msra_fill(self.pointwise)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
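A short sketch (not from the commit) of why the depthwise + pointwise factorization in `DepthwiseSeparableConv2d` is attractive: the same input/output shapes with far fewer parameters. Plain `nn.Conv2d` stands in here for detectron2's `Conv2d` wrapper; the channel counts are arbitrary.

```python
import torch
from torch import nn

in_ch, out_ch = 64, 128

standard = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)
depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3, padding=1, groups=in_ch, bias=False)
pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(standard))                      # 73728  (64 * 128 * 3 * 3)
print(count(depthwise) + count(pointwise))  # 8768   (64 * 3 * 3 + 64 * 128)

x = torch.randn(1, in_ch, 32, 32)
assert pointwise(depthwise(x)).shape == standard(x).shape
```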
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/README.md
To add a new Op:

1. Create a new directory
2. Implement new ops there
3. Declare its Python interface in `vision.cpp`.
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>

namespace detectron2 {

at::Tensor ROIAlignRotated_forward_cpu(
    const at::Tensor& input,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio);

at::Tensor ROIAlignRotated_backward_cpu(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    const int sampling_ratio);

#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor ROIAlignRotated_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio);

at::Tensor ROIAlignRotated_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    const int sampling_ratio);
#endif

// Interface for Python
inline at::Tensor ROIAlignRotated_forward(
    const at::Tensor& input,
    const at::Tensor& rois,
    const double spatial_scale,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t sampling_ratio) {
  if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
    return ROIAlignRotated_forward_cuda(
        input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
    AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
  }
  return ROIAlignRotated_forward_cpu(
      input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}

inline at::Tensor ROIAlignRotated_backward(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const double spatial_scale,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t batch_size,
    const int64_t channels,
    const int64_t height,
    const int64_t width,
    const int64_t sampling_ratio) {
  if (grad.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
    return ROIAlignRotated_backward_cuda(
        grad, rois, spatial_scale, pooled_height, pooled_width,
        batch_size, channels, height, width, sampling_ratio);
#else
    AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
  }
  return ROIAlignRotated_backward_cpu(
      grad, rois, spatial_scale, pooled_height, pooled_width,
      batch_size, channels, height, width, sampling_ratio);
}

} // namespace detectron2
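The kernels below index ROIs with a stride of 6, so each ROI row is (batch_index, x_center, y_center, width, height, angle_in_degrees). A Python-level sketch of calling the op through detectron2's `ROIAlignRotated` wrapper follows; the wrapper class, its signature, and all numbers are assumptions for illustration, not something this commit defines.

```python
import torch
from detectron2.layers import ROIAlignRotated  # assumed Python wrapper around these kernels

# Feature map: N=1, C=256, H=W=64; one rotated ROI per row of `rois`:
# (batch_index, x_center, y_center, width, height, angle_in_degrees).
features = torch.randn(1, 256, 64, 64)
rois = torch.tensor([[0.0, 120.0, 96.0, 80.0, 40.0, 30.0]])

# spatial_scale maps ROI coordinates (image space) into feature-map space,
# e.g. 1/4 for a stride-4 feature map; sampling_ratio=2 samples a 2x2 grid per bin.
pooler = ROIAlignRotated(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=2)
out = pooler(features, rois)
print(out.shape)  # expected: torch.Size([1, 256, 7, 7])
```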
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/TensorUtils.h>
#include "ROIAlignRotated.h"

// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.

namespace detectron2 {

namespace {
template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width,
    const int pooled_height, const int pooled_width,
    const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w,
    T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w,
    T roi_center_h, T roi_center_w,
    T cos_theta, T sin_theta,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void bilinear_interpolate_gradient(
    const int height, const int width,
    T y, T x,
    T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y < 0) {
    y = 0;
  }
  if (x < 0) {
    x = 0;
  }

  y_low = (int)y;
  x_low = (int)x;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;

  // reference in forward
  // T v1 = input[y_low * width + x_low];
  // T v2 = input[y_low * width + x_high];
  // T v3 = input[y_high * width + x_low];
  // T v4 = input[y_high * width + x_high];
  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}

template <class T>
inline void add(T* address, const T& val) {
  *address += val;
}

} // namespace

template <typename T>
void ROIAlignRotatedForward(
    const int nthreads,
    const T* input,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* rois,
    T* output) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    // ROIAlignRotated supports align == true, i.e., continuous coordinate
    // by default, thus the 0.5 offset
    T offset = (T)0.5;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5] * M_PI / 180.0;
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    AT_ASSERTM(
        roi_width >= 0 && roi_height >= 0,
        "ROIs in ROIAlignRotated do not have non-negative size!");

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(
        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    pre_calc_for_bilinear_interpolate(
        height, width,
        pooled_height, pooled_width,
        roi_bin_grid_h, roi_bin_grid_w,
        roi_start_h, roi_start_w,
        bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w,
        roi_center_h, roi_center_w,
        cos_theta, sin_theta,
        pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                  pc.w2 * offset_input[pc.pos2] +
                  pc.w3 * offset_input[pc.pos3] +
                  pc.w4 * offset_input[pc.pos4];

              pre_calc_index += 1;
            }
          }
          output_val /= count;

          output[index] = output_val;
        } // for pw
      } // for ph
    } // for c
  } // for n
}

template <typename T>
void ROIAlignRotatedBackward(
    const int nthreads,
    // may not be contiguous. should index using n_stride, etc
    const T* grad_output,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    T* grad_input,
    const T* rois,
    const int n_stride,
    const int c_stride,
    const int h_stride,
    const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    // ROIAlignRotated supports align == true, i.e., continuous coordinate
    // by default, thus the 0.5 offset
    T offset = (T)0.5;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5] * M_PI / 180.0;
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    AT_ASSERTM(
        roi_width >= 0 && roi_height >= 0,
        "ROIs in ROIAlignRotated do not have non-negative size!");

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    int output_offset = n * n_stride + c * c_stride;
    const T* offset_grad_output = grad_output + output_offset;
    const T grad_output_this_bin =
        offset_grad_output[ph * h_stride + pw * w_stride];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T yy = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(
            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);

        T g1 = grad_output_this_bin * w1 / count;
        T g2 = grad_output_this_bin * w2 / count;
        T g3 = grad_output_this_bin * w3 / count;
        T g4 = grad_output_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // atomic add is not needed for now since it is single threaded
          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
        } // if
      } // ix
    } // iy
  } // for
} // ROIAlignRotatedBackward

at::Tensor ROIAlignRotated_forward_cpu(
    const at::Tensor& input,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio) {
  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");

  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};

  at::CheckedFrom c = "ROIAlign_forward_cpu";
  at::checkAllSameType(c, {input_t, rois_t});

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  at::Tensor output = at::zeros(
      {num_rois, channels, pooled_height, pooled_width}, input.options());

  auto output_size = num_rois * pooled_height * pooled_width * channels;

  if (output.numel() == 0) {
    return output;
  }

  auto input_ = input.contiguous(), rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        ROIAlignRotatedForward<scalar_t>(
            output_size,
            input_.data_ptr<scalar_t>(),
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            rois_.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
  return output;
}

at::Tensor ROIAlignRotated_backward_cpu(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    const int sampling_ratio) {
  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");

  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};

  at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
  at::checkAllSameType(c, {grad_t, rois_t});

  at::Tensor grad_input =
      at::zeros({batch_size, channels, height, width}, grad.options());

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    return grad_input;
  }

  // get stride values to ensure indexing into gradients is correct.
  int n_stride = grad.stride(0);
  int c_stride = grad.stride(1);
  int h_stride = grad.stride(2);
  int w_stride = grad.stride(3);

  auto rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad.scalar_type(), "ROIAlignRotated_forward", [&] {
        ROIAlignRotatedBackward<scalar_t>(
            grad.numel(),
            grad.data_ptr<scalar_t>(),
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            grad_input.data_ptr<scalar_t>(),
            rois_.data_ptr<scalar_t>(),
            n_stride,
            c_stride,
            h_stride,
            w_stride);
      });
  return grad_input;
}

} // namespace detectron2
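The rotation and bilinear-interpolation logic above is compact enough to restate in Python for reference. The sketch below is illustrative only (not part of the commit); it mirrors the clamping rules and the weights w1..w4 = hy*hx, hy*lx, ly*hx, ly*lx computed by `pre_calc_for_bilinear_interpolate` and `bilinear_interpolate_gradient`, and the sample coordinates are arbitrary.

```python
import math

def bilinear_weights(y, x, height, width):
    # Mirrors the clamping and weight computation in the C++ kernels:
    # returns 4 flattened corner indices and the weights w1..w4.
    if y < -1.0 or y > height or x < -1.0 or x > width:
        return [0, 0, 0, 0], [0.0, 0.0, 0.0, 0.0]
    y, x = max(y, 0.0), max(x, 0.0)
    y_low, x_low = int(y), int(x)
    if y_low >= height - 1:
        y_high = y_low = height - 1
        y = float(y_low)
    else:
        y_high = y_low + 1
    if x_low >= width - 1:
        x_high = x_low = width - 1
        x = float(x_low)
    else:
        x_high = x_low + 1
    ly, lx = y - y_low, x - x_low
    hy, hx = 1.0 - ly, 1.0 - lx
    pos = [y_low * width + x_low, y_low * width + x_high,
           y_high * width + x_low, y_high * width + x_high]
    return pos, [hy * hx, hy * lx, ly * hx, ly * lx]

def rotate_to_feature_space(yy, xx, theta_deg, roi_center_h, roi_center_w):
    # Sampling point (yy, xx) is defined relative to the ROI center, rotated by
    # theta (counterclockwise) and translated to the ROI center, as in the kernel.
    theta = math.radians(theta_deg)
    y = yy * math.cos(theta) - xx * math.sin(theta) + roi_center_h
    x = yy * math.sin(theta) + xx * math.cos(theta) + roi_center_w
    return y, x

pos, w = bilinear_weights(*rotate_to_feature_space(-1.25, 2.0, 30.0, 10.0, 12.0), 64, 64)
print(pos, w)  # the four weights sum to 1 for in-bounds points
```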
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.

namespace detectron2 {

namespace {

template <typename T>
__device__ T bilinear_interpolate(
    const T* input, const int height, const int width, T y, T x) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    return 0;
  }

  if (y < 0) {
    y = 0;
  }
  if (x < 0) {
    x = 0;
  }

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;
  // do bilinear interpolation
  T v1 = input[y_low * width + x_low];
  T v2 = input[y_low * width + x_high];
  T v3 = input[y_high * width + x_low];
  T v4 = input[y_high * width + x_high];
  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  return val;
}

template <typename T>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width,
    T y, T x,
    T& w1, T& w2, T& w3, T& w4,
    int& x_low, int& x_high, int& y_low, int& y_high) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y < 0) {
    y = 0;
  }
  if (x < 0) {
    x = 0;
  }

  y_low = (int)y;
  x_low = (int)x;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;

  // reference in forward
  // T v1 = input[y_low * width + x_low];
  // T v2 = input[y_low * width + x_high];
  // T v3 = input[y_high * width + x_low];
  // T v4 = input[y_high * width + x_high];
  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}

} // namespace

template <typename T>
__global__ void RoIAlignRotatedForward(
    const int nthreads,
    const T* input,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* rois,
    T* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    // ROIAlignRotated supports align == true, i.e., continuous coordinate
    // by default, thus the 0.5 offset
    T offset = (T)0.5;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5] * M_PI / 180.0;
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    const T* offset_input =
        input + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
    {
      const T yy = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T val = bilinear_interpolate(offset_input, height, width, y, x);
        output_val += val;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

template <typename T>
__global__ void RoIAlignRotatedBackwardFeature(
    const int nthreads,
    const T* top_diff,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    T* bottom_diff,
    const T* rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    // ROIAlignRotated supports align == true, i.e., continuous coordinate
    // by default, thus the 0.5 offset
    T offset = (T)0.5;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5] * M_PI / 180.0;
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;

    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
    {
      const T yy = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(
            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);

        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(
              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
          atomicAdd(
              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
        } // if
      } // ix
    } // iy
  } // CUDA_1D_KERNEL_LOOP
} // RoIAlignRotatedBackward

at::Tensor ROIAlignRotated_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio) {
  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};

  at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
  at::checkAllSameGPU(c, {input_t, rois_t});
  at::checkAllSameType(c, {input_t, rois_t});
  at::cuda::CUDAGuard device_guard(input.device());

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  auto output = at::empty(
      {num_rois, channels, pooled_height, pooled_width}, input.options());
  auto output_size = num_rois * pooled_height * pooled_width * channels;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
      at::cuda::ATenCeilDiv(
          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
      static_cast<int64_t>(4096)));
  dim3 block(512);

  if (output.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return output;
  }

  auto input_ = input.contiguous(), rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
            output_size,
            input_.data_ptr<scalar_t>(),
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            rois_.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}

// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignRotated_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    const int sampling_ratio) {
  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");

  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
  at::CheckedFrom c = "ROIAlign_backward_cuda";
  at::checkAllSameGPU(c, {grad_t, rois_t});
  at::checkAllSameType(c, {grad_t, rois_t});
  at::cuda::CUDAGuard device_guard(grad.device());

  auto num_rois = rois.size(0);
  auto grad_input =
      at::zeros({batch_size, channels, height, width}, grad.options());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
      at::cuda::ATenCeilDiv(
          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
      static_cast<int64_t>(4096)));
  dim3 block(512);

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return grad_input;
  }

  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES(
      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
        RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
            grad.numel(),
            grad_.data_ptr<scalar_t>(),
            num_rois,
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            grad_input.data_ptr<scalar_t>(),
            rois_.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
  return grad_input;
}

} // namespace detectron2
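Both CUDA entry points cap the launch at 4096 blocks of 512 threads and rely on the grid-stride `CUDA_1D_KERNEL_LOOP` to cover any remaining elements. A small Python sketch of that arithmetic (illustrative only; the ROI/channel counts are made up):

```python
def launch_config(output_size, threads_per_block=512, max_blocks=4096):
    # Same arithmetic as dim3 grid(std::min(ATenCeilDiv(output_size, 512), 4096)):
    # when capped, each thread strides by blocks * threads until all elements are done.
    blocks = min((output_size + threads_per_block - 1) // threads_per_block, max_blocks)
    return blocks, threads_per_block

# A 7x7 pooled output for 1000 ROIs with 256 channels:
n_elems = 1000 * 256 * 7 * 7          # 12,544,000 output elements
blocks, threads = launch_config(n_elems)
print(blocks, threads)                 # 4096 512 -> each thread handles ~6 elements
```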
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>

namespace detectron2 {

at::Tensor box_iou_rotated_cpu(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2);

#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor box_iou_rotated_cuda(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2);
#endif

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
inline at::Tensor box_iou_rotated(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
  if (boxes1.device().is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
    return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
#else
    AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
  }
  return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
}

} // namespace detectron2
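The kernels that back this interface index boxes with a stride of 5, so each box is (x_center, y_center, width, height, angle_in_degrees). A hand-computed sanity check for the degenerate angle = 0 case (pure Python, not from the commit): the rotated IoU must then reduce to ordinary axis-aligned IoU.

```python
def iou_axis_aligned(b1, b2):
    # Boxes given as (x_ctr, y_ctr, w, h, angle); with angle == 0 the rotated IoU
    # must match this ordinary intersection-over-union.
    (x1, y1, w1, h1, _), (x2, y2, w2, h2, _) = b1, b2
    ix = max(0.0, min(x1 + w1 / 2, x2 + w2 / 2) - max(x1 - w1 / 2, x2 - w2 / 2))
    iy = max(0.0, min(y1 + h1 / 2, y2 + h2 / 2) - max(y1 - h1 / 2, y2 - h2 / 2))
    inter = ix * iy
    return inter / (w1 * h1 + w2 * h2 - inter)

b1 = (10.0, 10.0, 8.0, 4.0, 0.0)
b2 = (12.0, 10.0, 8.0, 4.0, 0.0)
print(iou_axis_aligned(b1, b2))  # 6*4 / (32 + 32 - 24) = 0.6
```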
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
// Copyright (c) Facebook, Inc. and its affiliates.
#include "box_iou_rotated.h"
#include "box_iou_rotated_utils.h"

namespace detectron2 {

template <typename T>
void box_iou_rotated_cpu_kernel(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2,
    at::Tensor& ious) {
  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);

  for (int i = 0; i < num_boxes1; i++) {
    for (int j = 0; j < num_boxes2; j++) {
      ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
          boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
    }
  }
}

at::Tensor box_iou_rotated_cpu(
    // input must be contiguous:
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);
  at::Tensor ious =
      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));

  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);

  // reshape from 1d array to 2d array
  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
  return ious.reshape(shape);
}

} // namespace detectron2
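For reference, the memory layout the CPU kernel produces can be restated in a few lines of Python (illustrative only; `single_iou` below is a dummy stand-in for `single_box_iou_rotated`): a flat buffer filled at index i * num_boxes2 + j, then reshaped to an (num_boxes1, num_boxes2) matrix.

```python
import numpy as np

def pairwise(boxes1, boxes2, single_iou):
    # Same flat layout and reshape as box_iou_rotated_cpu.
    n1, n2 = len(boxes1), len(boxes2)
    ious = np.empty(n1 * n2, dtype=np.float32)
    for i in range(n1):
        for j in range(n2):
            ious[i * n2 + j] = single_iou(boxes1[i], boxes2[j])
    return ious.reshape(n1, n2)

# Dummy scorer just to show shapes; a real one would be the rotated IoU.
print(pairwise(np.zeros((3, 5)), np.zeros((4, 5)), lambda a, b: 0.0).shape)  # (3, 4)
```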
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include "box_iou_rotated_utils.h"

namespace detectron2 {

// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

template <typename T>
__global__ void box_iou_rotated_cuda_kernel(
    const int n_boxes1,
    const int n_boxes2,
    const T* dev_boxes1,
    const T* dev_boxes2,
    T* dev_ious) {
  const int row_start = blockIdx.x * blockDim.x;
  const int col_start = blockIdx.y * blockDim.y;

  const int row_size = min(n_boxes1 - row_start, blockDim.x);
  const int col_size = min(n_boxes2 - col_start, blockDim.y);

  __shared__ float block_boxes1[BLOCK_DIM_X * 5];
  __shared__ float block_boxes2[BLOCK_DIM_Y * 5];

  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
  if (threadIdx.x < row_size && threadIdx.y == 0) {
    block_boxes1[threadIdx.x * 5 + 0] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
    block_boxes1[threadIdx.x * 5 + 1] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
    block_boxes1[threadIdx.x * 5 + 2] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
    block_boxes1[threadIdx.x * 5 + 3] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
    block_boxes1[threadIdx.x * 5 + 4] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
  }

  if (threadIdx.x < col_size && threadIdx.y == 0) {
    block_boxes2[threadIdx.x * 5 + 0] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
    block_boxes2[threadIdx.x * 5 + 1] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
    block_boxes2[threadIdx.x * 5 + 2] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
    block_boxes2[threadIdx.x * 5 + 3] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
    block_boxes2[threadIdx.x * 5 + 4] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size && threadIdx.y < col_size) {
    int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
    dev_ious[offset] = single_box_iou_rotated<T>(
        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
  }
}

at::Tensor box_iou_rotated_cuda(
    // input must be contiguous
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  using scalar_t = float;
  AT_ASSERTM(
      boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
  AT_ASSERTM(
      boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(boxes1.device());

  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);

  at::Tensor ious =
      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));

  bool transpose = false;
  if (num_boxes1 > 0 && num_boxes2 > 0) {
    scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
             *data2 = boxes2.data_ptr<scalar_t>();

    if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
      AT_ASSERTM(
          num_boxes1 <= 65535 * BLOCK_DIM_Y,
          "Too many boxes for box_iou_rotated_cuda!");
      // x dim is allowed to be large, but y dim cannot,
      // so we transpose the two to avoid "invalid configuration argument"
      // error. We assume one of them is small. Otherwise the result is hard to
      // fit in memory anyway.
      std::swap(num_boxes1, num_boxes2);
      std::swap(data1, data2);
      transpose = true;
    }

    const int blocks_x =
        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
    const int blocks_y =
        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);

    dim3 blocks(blocks_x, blocks_y);
    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        num_boxes1,
        num_boxes2,
        data1,
        data2,
        (scalar_t*)ious.data_ptr<scalar_t>());

    AT_CUDA_CHECK(cudaGetLastError());
  }

  // reshape from 1d array to 2d array
  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
  if (transpose) {
    return ious.view(shape).t();
  } else {
    return ious.view(shape);
  }
}

} // namespace detectron2
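The tiling and the transpose trick above are easy to lose in the kernel boilerplate, so here is a Python restatement of just that launch arithmetic (illustrative only; the box counts are made up): a 32x16 thread block covers a 32x16 tile of the IoU matrix, and when boxes2 would need more than 65535 blocks along y the operands are swapped and the result is transposed afterwards.

```python
import math

def grid_for(n_boxes1, n_boxes2, block_x=32, block_y=16, max_grid_y=65535):
    # Mirrors the block/grid computation in box_iou_rotated_cuda.
    transpose = False
    if n_boxes2 > max_grid_y * block_y:
        assert n_boxes1 <= max_grid_y * block_y, "Too many boxes for box_iou_rotated_cuda!"
        n_boxes1, n_boxes2 = n_boxes2, n_boxes1
        transpose = True
    blocks = (math.ceil(n_boxes1 / block_x), math.ceil(n_boxes2 / block_y))
    return blocks, transpose

print(grid_for(100_000, 2_000_000))  # ((62500, 6250), True): operands swapped, result transposed
```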
data_generation/grit/third_party/CenterNet2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once

#include <cassert>
#include <cmath>

#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif

namespace detectron2 {

namespace {

template <typename T>
struct RotatedBox {
  T x_ctr, y_ctr, w, h, a;
};

template <typename T>
struct Point {
  T x, y;
  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
  HOST_DEVICE_INLINE Point operator+(const Point& p) const {
    return Point(x + p.x, y + p.y);
  }
  HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
    x += p.x;
    y += p.y;
    return *this;
  }
  HOST_DEVICE_INLINE Point operator-(const Point& p) const {
    return Point(x - p.x, y - p.y);
  }
  HOST_DEVICE_INLINE Point operator*(const T coeff) const {
    return Point(x * coeff, y * coeff);
  }
};

template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
  return A.x * B.x + A.y * B.y;
}

// R: result type. can be different from input type
template <typename T, typename R = T>
HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
  return static_cast<R>(A.x) * static_cast<R>(B.y) -
      static_cast<R>(B.x) * static_cast<R>(A.y);
}

template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(
    const RotatedBox<T>& box,
    Point<T> (&pts)[4]) {
  // M_PI / 180. == 0.01745329251
  double theta = box.a * 0.01745329251;
  T cosTheta2 = (T)cos(theta) * 0.5f;
  T sinTheta2 = (T)sin(theta) * 0.5f;

  // y: top --> down; x: left --> right
  pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  pts[2].x = 2 * box.x_ctr - pts[0].x;
  pts[2].y = 2 * box.y_ctr - pts[0].y;
  pts[3].x = 2 * box.x_ctr - pts[1].x;
  pts[3].y = 2 * box.y_ctr - pts[1].y;
}

template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(
    const Point<T> (&pts1)[4],
    const Point<T> (&pts2)[4],
    Point<T> (&intersections)[24]) {
  // Line vector
  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
  Point<T> vec1[4], vec2[4];
  for (int i = 0; i < 4; i++) {
    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
  }

  // When computing the intersection area, it doesn't hurt if we have
  // more (duplicated/approximate) intersections/vertices than needed,
  // while it can cause drastic difference if we miss an intersection/vertex.
  // Therefore, we add an epsilon to relax the comparisons between
  // the float point numbers that decide the intersection points.
  double EPS = 1e-5;

  // Line test - test all line combos for intersection
  int num = 0; // number of intersections
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // Solve for 2x2 Ax=b
      T det = cross_2d<T>(vec2[j], vec1[i]);

      // This takes care of parallel lines
      if (fabs(det) <= 1e-14) {
        continue;
      }

      auto vec12 = pts2[j] - pts1[i];

      T t1 = cross_2d<T>(vec2[j], vec12) / det;
      T t2 = cross_2d<T>(vec1[i], vec12) / det;

      if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) {
        intersections[num++] = pts1[i] + vec1[i] * t1;
      }
    }
  }

  // Check for vertices of rect1 inside rect2
  {
    const auto& AB = vec2[0];
    const auto& DA = vec2[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      // assume ABCD is the rectangle, and P is the point to be judged
      // P is inside ABCD iff. P's projection on AB lies within AB
      // and P's projection on AD lies within AD
      auto AP = pts1[i] - pts2[0];

      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);

      if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
          (APdotAD < ADdotAD + EPS)) {
        intersections[num++] = pts1[i];
      }
    }
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  {
    const auto& AB = vec1[0];
    const auto& DA = vec1[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      auto AP = pts2[i] - pts1[0];

      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);

      if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
          (APdotAD < ADdotAD + EPS)) {
        intersections[num++] = pts2[i];
      }
    }
  }

  return num;
}

template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(
    const Point<T> (&p)[24],
    const int& num_in,
    Point<T> (&q)[24],
    bool shift_to_zero = false) {
  assert(num_in >= 2);

  // Step 1:
  // Find point with minimum y
  // if more than 1 points have the same minimum y,
  // pick the one with the minimum x.
  int t = 0;
  for (int i = 1; i < num_in; i++) {
    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
      t = i;
    }
  }
  auto& start = p[t]; // starting point

  // Step 2:
  // Subtract starting point from every points (for sorting in the next step)
  for (int i = 0; i < num_in; i++) {
    q[i] = p[i] - start;
  }

  // Swap the starting point to position 0
  auto tmp = q[0];
  q[0] = q[t];
  q[t] = tmp;

  // Step 3:
  // Sort point 1 ~ num_in according to their relative cross-product values
  // (essentially sorting according to angles)
  // If the angles are the same, sort according to their distance to origin
  T dist[24];
#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
  // compute distance to origin before sort, and sort them together with the
  // points
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }

  // CUDA version
  // In the future, we can potentially use thrust
  // for sorting here to improve speed (though not guaranteed)
  for (int i = 1; i < num_in - 1; i++) {
    for (int j = i + 1; j < num_in; j++) {
      T crossProduct = cross_2d<T>(q[i], q[j]);
      if ((crossProduct < -1e-6) ||
          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
        auto q_tmp = q[i];
        q[i] = q[j];
        q[j] = q_tmp;
        auto dist_tmp = dist[i];
        dist[i] = dist[j];
        dist[j] = dist_tmp;
      }
    }
  }
#else
  // CPU version
  std::sort(
      q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
        T temp = cross_2d<T>(A, B);
        if (fabs(temp) < 1e-6) {
          return dot_2d<T>(A, A) < dot_2d<T>(B, B);
        } else {
          return temp > 0;
        }
      });
  // compute distance to origin after sort, since the points are now different.
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }
#endif

  // Step 4:
  // Make sure there are at least 2 points (that don't overlap with each other)
  // in the stack
  int k; // index of the non-overlapped second point
  for (k = 1; k < num_in; k++) {
    if (dist[k] > 1e-8) {
      break;
    }
  }
  if (k == num_in) {
    // We reach the end, which means the convex hull is just one point
    q[0] = p[t];
    return 1;
  }
  q[1] = q[k];
  int m = 2; // 2 points in the stack
  // Step 5:
  // Finally we can start the scanning process.
  // When a non-convex relationship between the 3 points is found
  // (either concave shape or duplicated points),
  // we pop the previous point from the stack
  // until the 3-point relationship is convex again, or
  // until the stack only contains two points
  for (int i = k + 1; i < num_in; i++) {
    while (m > 1) {
      auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2];
      // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
      // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
      // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
      // round to nearest floating point).
      if (q1.x * q2.y >= q2.x * q1.y)
        m--;
      else
        break;
    }
    // Using double also helps, but float can solve the issue for now.
    // while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
    // >= 0) {
    //     m--;
    // }
    q[m++] = q[i];
  }

  // Step 6 (Optional):
  // In general sense we need the original coordinates, so we
  // need to shift the points back (reverting Step 2)
  // But if we're only interested in getting the area/perimeter of the shape
  // We can simply return.
  if (!shift_to_zero) {
    for (int i = 0; i < m; i++) {
      q[i] += start;
    }
  }

  return m;
}

template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
  if (m <= 2) {
    return 0;
  }

  T area = 0;
  for (int i = 1; i < m - 1; i++) {
    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
  }

  return area / 2.0;
}

template <typename T>
HOST_DEVICE_INLINE T rotated_boxes_intersection(
    const RotatedBox<T>& box1,
    const RotatedBox<T>& box2) {
  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
  // from rotated_rect_intersection_pts
  Point<T> intersectPts[24], orderedPts[24];

  Point<T> pts1[4];
  Point<T> pts2[4];
  get_rotated_vertices<T>(box1, pts1);
  get_rotated_vertices<T>(box2, pts2);

  int num = get_intersection_points<T>(pts1, pts2, intersectPts);

  if (num <= 2) {
    return 0.0;
  }

  // Convex Hull to order the intersection points in clockwise order and find
  // the contour area.
  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
  return polygon_area<T>(orderedPts, num_convex);
}

} // namespace

template <typename T>
HOST_DEVICE_INLINE T
single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
  // shift center to the middle point to achieve higher precision in result
  RotatedBox<T> box1, box2;
  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
  box1.x_ctr = box1_raw[0] - center_shift_x;
  box1.y_ctr = box1_raw[1] - center_shift_y;
  box1.w = box1_raw[2];
  box1.h = box1_raw[3];
  box1.a = box1_raw[4];
  box2.x_ctr = box2_raw[0] - center_shift_x;
  box2.y_ctr = box2_raw[1] - center_shift_y;
  box2.w = box2_raw[2];
  box2.h = box2_raw[3];
  box2.a = box2_raw[4];

  T area1 = box1.w * box1.h;
  T area2 = box2.w * box2.h;
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 0.f;
  }

  T intersection = rotated_boxes_intersection<T>(box1, box2);
  T iou = intersection / (area1 + area2 - intersection);
  return iou;
}

} // namespace detectron2
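A Python restatement (illustrative, not from the commit) of two geometric pieces that `single_box_iou_rotated` builds on: the vertex construction of `get_rotated_vertices` and the fan-triangulation area of `polygon_area`. Rotating a box never changes its own area, which gives a cheap sanity check.

```python
import math

def rotated_vertices(x_ctr, y_ctr, w, h, angle_deg):
    # Same construction as get_rotated_vertices: two adjacent corners, then the
    # other two by point reflection through the center.
    theta = math.radians(angle_deg)
    c2, s2 = 0.5 * math.cos(theta), 0.5 * math.sin(theta)
    p0 = (x_ctr + s2 * h + c2 * w, y_ctr + c2 * h - s2 * w)
    p1 = (x_ctr - s2 * h + c2 * w, y_ctr - c2 * h - s2 * w)
    p2 = (2 * x_ctr - p0[0], 2 * y_ctr - p0[1])
    p3 = (2 * x_ctr - p1[0], 2 * y_ctr - p1[1])
    return [p0, p1, p2, p3]

def polygon_area(pts):
    # Fan triangulation from pts[0], as in polygon_area(): sum of |cross| / 2.
    (x0, y0) = pts[0]
    area = 0.0
    for (x1, y1), (x2, y2) in zip(pts[1:-1], pts[2:]):
        area += abs((x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0))
    return area / 2.0

# Sanity check: the area of the box itself is w * h regardless of the angle.
pts = rotated_vertices(10.0, 20.0, 8.0, 4.0, 37.0)
print(round(polygon_area(pts), 6))  # 32.0
```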