OpenDAS / d2go / Commits

Commit f23248c0, authored Mar 02, 2021 by facebook-github-bot
Initial commit
fbshipit-source-id: f4a8ba78691d8cf46e003ef0bd2e95f170932778
Changes: 172
Showing 20 changed files with 3423 additions and 0 deletions (+3423, -0).
- projects_oss/detr/detr/datasets/coco_eval.py (+259, -0)
- projects_oss/detr/detr/datasets/coco_panoptic.py (+101, -0)
- projects_oss/detr/detr/datasets/panoptic_eval.py (+46, -0)
- projects_oss/detr/detr/datasets/transforms.py (+278, -0)
- projects_oss/detr/detr/functions/__init__.py (+12, -0)
- projects_oss/detr/detr/functions/ms_deform_attn_func.py (+63, -0)
- projects_oss/detr/detr/hub.py (+170, -0)
- projects_oss/detr/detr/models/__init__.py (+8, -0)
- projects_oss/detr/detr/models/backbone.py (+140, -0)
- projects_oss/detr/detr/models/deformable_detr.py (+306, -0)
- projects_oss/detr/detr/models/deformable_transformer.py (+394, -0)
- projects_oss/detr/detr/models/detr.py (+188, -0)
- projects_oss/detr/detr/models/matcher.py (+99, -0)
- projects_oss/detr/detr/models/position_encoding.py (+96, -0)
- projects_oss/detr/detr/models/segmentation.py (+365, -0)
- projects_oss/detr/detr/models/setcriterion.py (+382, -0)
- projects_oss/detr/detr/models/transformer.py (+299, -0)
- projects_oss/detr/detr/modules/__init__.py (+11, -0)
- projects_oss/detr/detr/modules/ms_deform_attn.py (+117, -0)
- projects_oss/detr/detr/runner.py (+89, -0)
projects_oss/detr/detr/datasets/coco_eval.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
COCO evaluator that works in distributed mode.
Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
The difference is that there is less copy-pasting from pycocotools
in the end of the file, as python3 can suppress prints with contextlib
"""
import os
import contextlib
import copy
import numpy as np
import torch

from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

from util.misc import all_gather


class CocoEvaluator(object):
    def __init__(self, coco_gt, iou_types):
        assert isinstance(iou_types, (list, tuple))
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}

    def update(self, predictions):
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)

            # suppress pycocotools prints
            with open(os.devnull, 'w') as devnull:
                with contextlib.redirect_stdout(devnull):
                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
            coco_eval = self.coco_eval[iou_type]

            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            img_ids, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        for iou_type in self.iou_types:
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"]
            labels = prediction["labels"]
            masks = prediction["masks"]

            masks = masks > 0.5

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "segmentation": rle,
                        "score": scores[k],
                    }
                    for k, rle in enumerate(rles)
                ]
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        'keypoints': keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results


def convert_to_xywh(boxes):
    xmin, ymin, xmax, ymax = boxes.unbind(1)
    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)


def merge(img_ids, eval_imgs):
    all_img_ids = all_gather(img_ids)
    all_eval_imgs = all_gather(eval_imgs)

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs


def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    img_ids, eval_imgs = merge(img_ids, eval_imgs)
    img_ids = list(img_ids)
    eval_imgs = list(eval_imgs.flatten())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)


#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
def evaluate(self):
    '''
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
    :return: None
    '''
    # tic = time.time()
    # print('Running per image evaluation...')
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType == 'segm' or p.iouType == 'bbox':
        computeIoU = self.computeIoU
    elif p.iouType == 'keypoints':
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds}

    evaluateImg = self.evaluateImg
    maxDet = p.maxDets[-1]
    evalImgs = [
        evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    # toc = time.time()
    # print('DONE (t={:0.2f}s).'.format(toc-tic))
    return p.imgIds, evalImgs

#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
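As a rough usage sketch (not part of this commit): CocoEvaluator is driven once per validation pass. The ground-truth COCO object, the prediction dict keyed by image id, and the batch iterable below are assumptions about the caller.

# Hypothetical driver, assuming `coco_gt` is a pycocotools COCO object and each
# prediction is a dict with "boxes" (xyxy), "scores" and "labels" tensors.
evaluator = CocoEvaluator(coco_gt, iou_types=("bbox",))
for image_ids, predictions in validation_batches:   # assumed iterable of (ids, per-image dicts)
    evaluator.update(dict(zip(image_ids, predictions)))
evaluator.synchronize_between_processes()           # merges results across ranks (no-op on one process)
evaluator.accumulate()
evaluator.summarize()                                # prints the standard COCO AP table per IoU type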
projects_oss/detr/detr/datasets/coco_panoptic.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import json
from pathlib import Path

import numpy as np
import torch
from PIL import Image

from panopticapi.utils import rgb2id
from util.box_ops import masks_to_boxes

from .coco import make_coco_transforms


class CocoPanoptic:
    def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
        with open(ann_file, 'r') as f:
            self.coco = json.load(f)

        # sort 'images' field so that they are aligned with 'annotations'
        # i.e., in alphabetical order
        self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
        # sanity check
        if "annotations" in self.coco:
            for img, ann in zip(self.coco['images'], self.coco['annotations']):
                assert img['file_name'][:-4] == ann['file_name'][:-4]

        self.img_folder = img_folder
        self.ann_folder = ann_folder
        self.ann_file = ann_file
        self.transforms = transforms
        self.return_masks = return_masks

    def __getitem__(self, idx):
        ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
        img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
        ann_path = Path(self.ann_folder) / ann_info['file_name']

        img = Image.open(img_path).convert('RGB')
        w, h = img.size
        if "segments_info" in ann_info:
            masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
            masks = rgb2id(masks)

            ids = np.array([ann['id'] for ann in ann_info['segments_info']])
            masks = masks == ids[:, None, None]

            masks = torch.as_tensor(masks, dtype=torch.uint8)
            labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)

        target = {}
        target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
        if self.return_masks:
            target['masks'] = masks
        target['labels'] = labels

        target["boxes"] = masks_to_boxes(masks)

        target['size'] = torch.as_tensor([int(h), int(w)])
        target['orig_size'] = torch.as_tensor([int(h), int(w)])
        if "segments_info" in ann_info:
            for name in ['iscrowd', 'area']:
                target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.coco['images'])

    def get_height_and_width(self, idx):
        img_info = self.coco['images'][idx]
        height = img_info['height']
        width = img_info['width']
        return height, width


def build(image_set, args):
    img_folder_root = Path(args.coco_path)
    ann_folder_root = Path(args.coco_panoptic_path)
    assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
    assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
    mode = 'panoptic'
    PATHS = {
        "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
        "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
    }

    img_folder, ann_file = PATHS[image_set]
    img_folder_path = img_folder_root / img_folder
    ann_folder = ann_folder_root / f'{mode}_{img_folder}'
    ann_file = ann_folder_root / ann_file

    dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
                           transforms=make_coco_transforms(image_set), return_masks=args.masks)

    return dataset
projects_oss/detr/detr/datasets/panoptic_eval.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import json
import os

import util.misc as utils

try:
    from panopticapi.evaluation import pq_compute
except ImportError:
    pass


class PanopticEvaluator(object):
    def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
        self.gt_json = ann_file
        self.gt_folder = ann_folder
        if utils.is_main_process():
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
        self.output_dir = output_dir
        self.predictions = []

    def update(self, predictions):
        for p in predictions:
            with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        self.predictions += predictions

    def synchronize_between_processes(self):
        all_predictions = utils.all_gather(self.predictions)
        merged_predictions = []
        for p in all_predictions:
            merged_predictions += p
        self.predictions = merged_predictions

    def summarize(self):
        if utils.is_main_process():
            json_data = {"annotations": self.predictions}
            predictions_json = os.path.join(self.output_dir, "predictions.json")
            with open(predictions_json, "w") as f:
                f.write(json.dumps(json_data))
            return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
        return None
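A minimal sketch of how this evaluator is meant to be driven; the annotation paths and the prediction dicts (which must carry "file_name" and "png_string" keys) are assumptions about the caller, not part of this file.

# Hypothetical driver; paths and the prediction batches are assumed.
panoptic_evaluator = PanopticEvaluator(
    ann_file="annotations/panoptic_val2017.json",
    ann_folder="annotations/panoptic_val2017",
)
for preds in prediction_batches:  # each item: {"file_name": ..., "png_string": ..., ...}
    panoptic_evaluator.update(preds)
panoptic_evaluator.synchronize_between_processes()
pq_stats = panoptic_evaluator.summarize()  # PQ/SQ/RQ from panopticapi, on the main process only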
projects_oss/detr/detr/datasets/transforms.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Transforms and data augmentation for both image + bbox.
"""
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from detr.util.box_ops import box_xyxy_to_cxcywh
from detr.util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            target[field] = target[field][keep]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5

    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):  # noqa: P210
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):
    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
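For orientation, a sketch of how these transforms compose into a training pipeline, in the spirit of DETR-style augmentation; the scale lists and crop bounds below are illustrative assumptions, not values taken from this commit.

# Illustrative pipeline built only from classes defined in this file.
normalize = Compose([
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # standard ImageNet statistics
])
train_transform = Compose([
    RandomHorizontalFlip(),
    RandomSelect(
        RandomResize([480, 512, 544, 576, 608], max_size=1333),
        Compose([
            RandomResize([400, 500, 600]),
            RandomSizeCrop(384, 600),
            RandomResize([480, 512, 544, 576, 608], max_size=1333),
        ]),
    ),
    normalize,
])
# img is a PIL image; target is a dict with "boxes" (xyxy), "labels", "area", "iscrowd".
# img, target = train_transform(img, target)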
projects_oss/detr/detr/functions/__init__.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn_func import MSDeformAttnFunction
projects_oss/detr/detr/functions/ms_deform_attn_func.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from detr import _C as MSDA


class MSDeformAttnFunction(Function):
    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, value_spatial_shapes, value_level_start_index,
            sampling_locations, attention_weights, ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index,
                              sampling_locations, attention_weights)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = \
            MSDA.ms_deform_attn_backward(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, grad_output, ctx.im2col_step)

        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None


def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    # for debug and test only,
    # need to use cuda version instead
    N_, S_, M_, D_ = value.shape
    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_)
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
                                          mode='bilinear', padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
    attention_weights = attention_weights.transpose(1, 2).reshape(N_ * M_, 1, Lq_, L_ * P_)
    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_ * D_, Lq_)
    return output.transpose(1, 2).contiguous()
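The pure-PyTorch reference above is shape-sensitive, so a small self-contained shape check can help; every tensor below is random and the dimensions are arbitrary assumptions chosen only to exercise the function.

# Shape-check sketch for ms_deform_attn_core_pytorch (CPU, random inputs, assumed sizes).
import torch

N, M, D = 2, 8, 32                       # batch, heads, channels per head
shapes = [(16, 16), (8, 8)]              # (H, W) of each feature level
L = len(shapes)
S = sum(h * w for h, w in shapes)        # total number of value tokens across levels
Lq, P = 100, 4                           # queries, sampling points per level

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)                 # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights = attention_weights / attention_weights.sum((-2, -1), keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
assert out.shape == (N, Lq, M * D)       # one aggregated feature per query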
projects_oss/detr/detr/hub.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch

from detr.models.backbone import Backbone, Joiner
from detr.models.detr import DETR, PostProcess
from detr.models.position_encoding import PositionEmbeddingSine
from detr.models.segmentation import DETRsegm, PostProcessPanoptic
from detr.models.transformer import Transformer

dependencies = ["torch", "torchvision"]


def _make_detr(backbone_name: str, dilation=False, num_classes=91, mask=False):
    hidden_dim = 256
    backbone = Backbone(backbone_name, train_backbone=True, return_interm_layers=mask, dilation=dilation)
    pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True)
    backbone_with_pos_enc = Joiner(backbone, pos_enc)
    backbone_with_pos_enc.num_channels = backbone.num_channels
    transformer = Transformer(d_model=hidden_dim, return_intermediate_dec=True)
    detr = DETR(backbone_with_pos_enc, transformer, num_classes=num_classes, num_queries=100)
    if mask:
        return DETRsegm(detr)
    return detr


def detr_resnet50(pretrained=False, num_classes=91, return_postprocessor=False):
    """
    DETR R50 with 6 encoder and 6 decoder layers.
    Achieves 42/62.4 AP/AP50 on COCO val5k.
    """
    model = _make_detr("resnet50", dilation=False, num_classes=num_classes)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth", map_location="cpu", check_hash=True
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcess()
    return model


def detr_resnet50_dc5(pretrained=False, num_classes=91, return_postprocessor=False):
    """
    DETR-DC5 R50 with 6 encoder and 6 decoder layers.
    The last block of ResNet-50 has dilation to increase
    output resolution.
    Achieves 43.3/63.1 AP/AP50 on COCO val5k.
    """
    model = _make_detr("resnet50", dilation=True, num_classes=num_classes)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-f0fb7ef5.pth", map_location="cpu", check_hash=True
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcess()
    return model


def detr_resnet101(pretrained=False, num_classes=91, return_postprocessor=False):
    """
    DETR-DC5 R101 with 6 encoder and 6 decoder layers.
    Achieves 43.5/63.8 AP/AP50 on COCO val5k.
    """
    model = _make_detr("resnet101", dilation=False, num_classes=num_classes)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth", map_location="cpu", check_hash=True
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcess()
    return model


def detr_resnet101_dc5(pretrained=False, num_classes=91, return_postprocessor=False):
    """
    DETR-DC5 R101 with 6 encoder and 6 decoder layers.
    The last block of ResNet-101 has dilation to increase
    output resolution.
    Achieves 44.9/64.7 AP/AP50 on COCO val5k.
    """
    model = _make_detr("resnet101", dilation=True, num_classes=num_classes)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth", map_location="cpu", check_hash=True
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcess()
    return model


def detr_resnet50_panoptic(pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False):
    """
    DETR R50 with 6 encoder and 6 decoder layers.
    Achieves 43.4 PQ on COCO val5k.

    threshold is the minimum confidence required for keeping segments in the prediction
    """
    model = _make_detr("resnet50", dilation=False, num_classes=num_classes, mask=True)
    is_thing_map = {i: i <= 90 for i in range(250)}
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r50-panoptic-00ce5173.pth",
            map_location="cpu",
            check_hash=True,
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcessPanoptic(is_thing_map, threshold=threshold)
    return model


def detr_resnet50_dc5_panoptic(pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False):
    """
    DETR-DC5 R50 with 6 encoder and 6 decoder layers.
    The last block of ResNet-50 has dilation to increase
    output resolution.
    Achieves 44.6 on COCO val5k.

    threshold is the minimum confidence required for keeping segments in the prediction
    """
    model = _make_detr("resnet50", dilation=True, num_classes=num_classes, mask=True)
    is_thing_map = {i: i <= 90 for i in range(250)}
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-panoptic-da08f1b1.pth",
            map_location="cpu",
            check_hash=True,
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcessPanoptic(is_thing_map, threshold=threshold)
    return model


def detr_resnet101_panoptic(pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False):
    """
    DETR-DC5 R101 with 6 encoder and 6 decoder layers.
    Achieves 45.1 PQ on COCO val5k.

    threshold is the minimum confidence required for keeping segments in the prediction
    """
    model = _make_detr("resnet101", dilation=False, num_classes=num_classes, mask=True)
    is_thing_map = {i: i <= 90 for i in range(250)}
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url="https://dl.fbaipublicfiles.com/detr/detr-r101-panoptic-40021d53.pth",
            map_location="cpu",
            check_hash=True,
        )
        model.load_state_dict(checkpoint["model"])
    if return_postprocessor:
        return model, PostProcessPanoptic(is_thing_map, threshold=threshold)
    return model
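These hub entry points are plain callables, so they can also be used directly. A hedged inference sketch follows; the random input tensor, its size, and the 0.7 score cut-off are arbitrary choices for illustration, not part of this commit.

# Hypothetical inference sketch using the entry points defined above.
import torch

model, postprocess = detr_resnet50(pretrained=True, return_postprocessor=True)
model.eval()

images = torch.rand(1, 3, 800, 1066)            # stand-in for a normalized image batch
with torch.no_grad():
    outputs = model(images)
orig_sizes = torch.tensor([[800, 1066]])         # (height, width) of each original image
results = postprocess(outputs, orig_sizes)        # list of {"scores", "labels", "boxes"} per image
keep = results[0]["scores"] > 0.7                 # arbitrary confidence threshold
print(results[0]["boxes"][keep])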
projects_oss/detr/detr/models/__init__.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .detr import build


def build_model(args):
    return build(args)
projects_oss/detr/detr/models/backbone.py (new file, mode 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
# Modified from Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
"""
Backbone modules.
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

from detr.util.misc import NestedTensor, is_main_process

from .position_encoding import build_position_encoding


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n, eps=1e-5):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))
        self.eps = eps

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = self.eps
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
            # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}
            self.strides = [8, 16, 32]
            self.num_channels = [512, 1024, 2048]
        else:
            return_layers = {'layer4': "0"}
            self.strides = [32]
            self.num_channels = [2048]
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool):
        norm_layer = FrozenBatchNorm2d
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=norm_layer)
        assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded"
        super().__init__(backbone, train_backbone, return_interm_layers)
        if dilation:
            self.strides[-1] = self.strides[-1] // 2


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)
        self.strides = backbone.strides
        self.num_channels = backbone.num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
        # position encoding
        for x in out:
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    return_interm_layers = args.masks or (args.num_feature_levels > 1)
    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    return model
projects_oss/detr/detr/models/deformable_detr.py (new file, mode 100644)

# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
"""
Deformable DETR model and criterion classes.
"""
import torch
import torch.nn.functional as F
from torch import nn
import math

from ..util import box_ops
from ..util.misc import (NestedTensor, nested_tensor_from_tensor_list,
                         accuracy, get_world_size, interpolate,
                         is_dist_avail_and_initialized, inverse_sigmoid)

from .backbone import build_backbone
from .matcher import build_matcher
from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm,
                           dice_loss, sigmoid_focal_loss)
from .deformable_transformer import build_deforamble_transformer
import copy
from .setcriterion import FocalLossSetCriterion


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class DeformableDETR(nn.Module):
    """ This is the Deformable DETR module that performs object detection """
    def __init__(self, backbone, transformer, num_classes, num_queries, num_feature_levels,
                 aux_loss=True, with_box_refine=False, two_stage=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            with_box_refine: iterative bounding box refinement
            two_stage: two-stage Deformable DETR
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.num_feature_levels = num_feature_levels
        if not two_stage:
            self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
        if num_feature_levels > 1:
            num_backbone_outs = len(backbone.strides)
            input_proj_list = []
            for _ in range(num_backbone_outs):
                in_channels = backbone.num_channels[_]
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
            for _ in range(num_feature_levels - num_backbone_outs):
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                )])
        self.backbone = backbone
        self.aux_loss = aux_loss
        self.with_box_refine = with_box_refine
        self.two_stage = two_stage

        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(num_classes) * bias_value
        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

        # if two-stage, the last class_embed and bbox_embed is for region proposal generation
        num_pred = (transformer.decoder.num_layers + 1) if two_stage else transformer.decoder.num_layers
        if with_box_refine:
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
            # hack implementation for iterative bounding box refinement
            self.transformer.decoder.bbox_embed = self.bbox_embed
        else:
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
            self.transformer.decoder.bbox_embed = None
        if two_stage:
            # hack implementation for two-stage
            self.transformer.decoder.class_embed = self.class_embed
            for box_embed in self.bbox_embed:
                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            N, C, H, W = samples.tensor.size()
            sample_mask = torch.ones((N, H, W), dtype=torch.bool, device=src.device)
            for idx in range(N):
                image_size = samples.image_sizes[idx]
                h, w = image_size
                sample_mask[idx, :h, :w] = False
            sample_mask = sample_mask[None].float()
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                b, _, h, w = src.size()
                mask = F.interpolate(sample_mask, size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(srcs, masks, pos, query_embeds)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]


class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""

    @torch.no_grad()
    def forward(self, outputs, target_sizes):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        """
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = out_logits.sigmoid()
        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
        scores = topk_values
        topk_boxes = topk_indexes // out_logits.shape[2]
        labels = topk_indexes % out_logits.shape[2]
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]

        return results


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


def build(args):
    num_classes = 20 if args.dataset_file != 'coco' else 91
    if args.dataset_file == "coco_panoptic":
        num_classes = 250
    device = torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_deforamble_transformer(args)
    model = DeformableDETR(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        num_feature_levels=args.num_feature_levels,
        aux_loss=args.aux_loss,
        with_box_refine=args.with_box_refine,
        two_stage=args.two_stage,
    )
    if args.masks:
        model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None))
    matcher = build_matcher(args)
    weight_dict = {'loss_ce': args.cls_loss_coef, 'loss_bbox': args.bbox_loss_coef}
    weight_dict['loss_giou'] = args.giou_loss_coef
    if args.masks:
        weight_dict["loss_mask"] = args.mask_loss_coef
        weight_dict["loss_dice"] = args.dice_loss_coef
    # TODO this is a hack
    if args.aux_loss:
        aux_weight_dict = {}
        for i in range(args.dec_layers - 1):
            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
        aux_weight_dict.update({k + f'_enc': v for k, v in weight_dict.items()})
        weight_dict.update(aux_weight_dict)

    losses = ['labels', 'boxes', 'cardinality']
    if args.masks:
        losses += ["masks"]
    # num_classes, matcher, weight_dict, losses, focal_alpha=0.25
    criterion = FocalLossSetCriterion(num_classes, matcher, weight_dict, losses, focal_alpha=args.focal_alpha)
    criterion.to(device)
    postprocessors = {'bbox': PostProcess()}
    if args.masks:
        postprocessors['segm'] = PostProcessSegm()
        if args.dataset_file == "coco_panoptic":
            is_thing_map = {i: i <= 90 for i in range(201)}
            postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85)

    return model, criterion, postprocessors
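The box head used throughout this model is just the three-layer MLP defined above; a minimal sketch of its input/output contract follows, where the batch and query sizes are arbitrary assumptions.

# Minimal sketch: the bbox head maps decoder embeddings to 4 unnormalized box parameters.
import torch

hidden_dim = 256
bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)    # input_dim, hidden_dim, output_dim, num_layers
decoder_output = torch.rand(2, 100, hidden_dim)   # (batch, num_queries, hidden_dim), assumed sizes
boxes = bbox_embed(decoder_output).sigmoid()      # normalized (cx, cy, w, h) in [0, 1]
assert boxes.shape == (2, 100, 4)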
projects_oss/detr/detr/models/deformable_transformer.py (new file, mode 100644)

# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

import copy
from typing import Optional, List
import math

import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_

from ..util.misc import inverse_sigmoid
from ..modules import MSDeformAttn


class DeformableTransformer(nn.Module):
    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu", return_intermediate_dec=False,
                 num_feature_levels=4, dec_n_points=4, enc_n_points=4,
                 two_stage=False, two_stage_num_proposals=300):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals

        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, enc_n_points)
        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, dec_n_points)
        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        if two_stage:
            self.enc_output = nn.Linear(d_model, d_model)
            self.enc_output_norm = nn.LayerNorm(d_model)
            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
        else:
            self.reference_points = nn.Linear(d_model, 2)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if not self.two_stage:
            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
            constant_(self.reference_points.bias.data, 0.)
        normal_(self.level_embed)

    def get_proposal_pos_embed(self, proposals):
        num_pos_feats = 128
        temperature = 10000
        scale = 2 * math.pi

        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
        # N, L, 4
        proposals = proposals.sigmoid() * scale
        # N, L, 4, 128
        pos = proposals[:, :, :, None] / dim_t
        # N, L, 4, 64, 2
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos

    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
        N_, S_, C_ = memory.shape
        base_scale = 4.0
        proposals = []
        _cur = 0
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)

            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
            proposals.append(proposal)
            _cur += (H_ * W_)
        output_proposals = torch.cat(proposals, 1)
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        output_proposals = torch.log(output_proposals / (1 - output_proposals))
        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

        output_memory = memory
        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
        output_memory = self.enc_output_norm(self.enc_output(output_memory))
        return output_memory, output_proposals

    def get_valid_ratio(self, mask):
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def forward(self, srcs, masks, pos_embeds, query_embed=None):
        assert self.two_stage or query_embed is not None

        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, c, h, w = src.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)
            src = src.flatten(2).transpose(1, 2)
            mask = mask.flatten(1)
            pos_embed = pos_embed.flatten(2).transpose(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1)
        mask_flatten = torch.cat(mask_flatten, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)

        # prepare input for decoder
        bs, _, c = memory.shape
        if self.two_stage:
            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)

            # hack implementation for two-stage Deformable DETR
            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals

            topk = self.two_stage_num_proposals
            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
            topk_coords_unact = topk_coords_unact.detach()
            reference_points = topk_coords_unact.sigmoid()
            init_reference_out = reference_points
            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
        else:
            query_embed, tgt = torch.split(query_embed, c, dim=1)
            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
            reference_points = self.reference_points(query_embed).sigmoid()
            init_reference_out = reference_points

        # decoder
        hs, inter_references = self.decoder(tgt, reference_points, memory,
                                            spatial_shapes, level_start_index, valid_ratios, query_embed, mask_flatten)

        inter_references_out = inter_references
        if self.two_stage:
            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact
return
hs
,
init_reference_out
,
inter_references_out
,
None
,
None
class
DeformableTransformerEncoderLayer
(
nn
.
Module
):
def
__init__
(
self
,
d_model
=
256
,
d_ffn
=
1024
,
dropout
=
0.1
,
activation
=
"relu"
,
n_levels
=
4
,
n_heads
=
8
,
n_points
=
4
):
super
().
__init__
()
# self attention
self
.
self_attn
=
MSDeformAttn
(
d_model
,
n_levels
,
n_heads
,
n_points
)
self
.
dropout1
=
nn
.
Dropout
(
dropout
)
self
.
norm1
=
nn
.
LayerNorm
(
d_model
)
# ffn
self
.
linear1
=
nn
.
Linear
(
d_model
,
d_ffn
)
self
.
activation
=
_get_activation_fn
(
activation
)
self
.
dropout2
=
nn
.
Dropout
(
dropout
)
self
.
linear2
=
nn
.
Linear
(
d_ffn
,
d_model
)
self
.
dropout3
=
nn
.
Dropout
(
dropout
)
self
.
norm2
=
nn
.
LayerNorm
(
d_model
)
@
staticmethod
def
with_pos_embed
(
tensor
,
pos
):
return
tensor
if
pos
is
None
else
tensor
+
pos
def
forward_ffn
(
self
,
src
):
src2
=
self
.
linear2
(
self
.
dropout2
(
self
.
activation
(
self
.
linear1
(
src
))))
src
=
src
+
self
.
dropout3
(
src2
)
src
=
self
.
norm2
(
src
)
return
src
def
forward
(
self
,
src
,
pos
,
reference_points
,
spatial_shapes
,
level_start_index
,
padding_mask
=
None
):
# self attention
src2
=
self
.
self_attn
(
self
.
with_pos_embed
(
src
,
pos
),
reference_points
,
src
,
spatial_shapes
,
level_start_index
,
padding_mask
)
src
=
src
+
self
.
dropout1
(
src2
)
src
=
self
.
norm1
(
src
)
# ffn
src
=
self
.
forward_ffn
(
src
)
return
src
class
DeformableTransformerEncoder
(
nn
.
Module
):
def
__init__
(
self
,
encoder_layer
,
num_layers
):
super
().
__init__
()
self
.
layers
=
_get_clones
(
encoder_layer
,
num_layers
)
self
.
num_layers
=
num_layers
@
staticmethod
def
get_reference_points
(
spatial_shapes
,
valid_ratios
,
device
):
reference_points_list
=
[]
for
lvl
,
(
H_
,
W_
)
in
enumerate
(
spatial_shapes
):
ref_y
,
ref_x
=
torch
.
meshgrid
(
torch
.
linspace
(
0.5
,
H_
-
0.5
,
H_
,
dtype
=
torch
.
float32
,
device
=
device
),
torch
.
linspace
(
0.5
,
W_
-
0.5
,
W_
,
dtype
=
torch
.
float32
,
device
=
device
))
ref_y
=
ref_y
.
reshape
(
-
1
)[
None
]
/
(
valid_ratios
[:,
None
,
lvl
,
1
]
*
H_
)
ref_x
=
ref_x
.
reshape
(
-
1
)[
None
]
/
(
valid_ratios
[:,
None
,
lvl
,
0
]
*
W_
)
ref
=
torch
.
stack
((
ref_x
,
ref_y
),
-
1
)
reference_points_list
.
append
(
ref
)
reference_points
=
torch
.
cat
(
reference_points_list
,
1
)
reference_points
=
reference_points
[:,
:,
None
]
*
valid_ratios
[:,
None
]
return
reference_points
def
forward
(
self
,
src
,
spatial_shapes
,
level_start_index
,
valid_ratios
,
pos
=
None
,
padding_mask
=
None
):
output
=
src
reference_points
=
self
.
get_reference_points
(
spatial_shapes
,
valid_ratios
,
device
=
src
.
device
)
for
_
,
layer
in
enumerate
(
self
.
layers
):
output
=
layer
(
output
,
pos
,
reference_points
,
spatial_shapes
,
level_start_index
,
padding_mask
)
return
output
class
DeformableTransformerDecoderLayer
(
nn
.
Module
):
def
__init__
(
self
,
d_model
=
256
,
d_ffn
=
1024
,
dropout
=
0.1
,
activation
=
"relu"
,
n_levels
=
4
,
n_heads
=
8
,
n_points
=
4
):
super
().
__init__
()
# cross attention
self
.
cross_attn
=
MSDeformAttn
(
d_model
,
n_levels
,
n_heads
,
n_points
)
self
.
dropout1
=
nn
.
Dropout
(
dropout
)
self
.
norm1
=
nn
.
LayerNorm
(
d_model
)
# self attention
self
.
self_attn
=
nn
.
MultiheadAttention
(
d_model
,
n_heads
,
dropout
=
dropout
)
self
.
dropout2
=
nn
.
Dropout
(
dropout
)
self
.
norm2
=
nn
.
LayerNorm
(
d_model
)
# ffn
self
.
linear1
=
nn
.
Linear
(
d_model
,
d_ffn
)
self
.
activation
=
_get_activation_fn
(
activation
)
self
.
dropout3
=
nn
.
Dropout
(
dropout
)
self
.
linear2
=
nn
.
Linear
(
d_ffn
,
d_model
)
self
.
dropout4
=
nn
.
Dropout
(
dropout
)
self
.
norm3
=
nn
.
LayerNorm
(
d_model
)
@
staticmethod
def
with_pos_embed
(
tensor
,
pos
):
return
tensor
if
pos
is
None
else
tensor
+
pos
def
forward_ffn
(
self
,
tgt
):
tgt2
=
self
.
linear2
(
self
.
dropout3
(
self
.
activation
(
self
.
linear1
(
tgt
))))
tgt
=
tgt
+
self
.
dropout4
(
tgt2
)
tgt
=
self
.
norm3
(
tgt
)
return
tgt
def
forward
(
self
,
tgt
,
query_pos
,
reference_points
,
src
,
src_spatial_shapes
,
level_start_index
,
src_padding_mask
=
None
):
# self attention
q
=
k
=
self
.
with_pos_embed
(
tgt
,
query_pos
)
tgt2
=
self
.
self_attn
(
q
.
transpose
(
0
,
1
),
k
.
transpose
(
0
,
1
),
tgt
.
transpose
(
0
,
1
))[
0
].
transpose
(
0
,
1
)
tgt
=
tgt
+
self
.
dropout2
(
tgt2
)
tgt
=
self
.
norm2
(
tgt
)
# cross attention
tgt2
=
self
.
cross_attn
(
self
.
with_pos_embed
(
tgt
,
query_pos
),
reference_points
,
src
,
src_spatial_shapes
,
level_start_index
,
src_padding_mask
)
tgt
=
tgt
+
self
.
dropout1
(
tgt2
)
tgt
=
self
.
norm1
(
tgt
)
# ffn
tgt
=
self
.
forward_ffn
(
tgt
)
return
tgt
class
DeformableTransformerDecoder
(
nn
.
Module
):
def
__init__
(
self
,
decoder_layer
,
num_layers
,
return_intermediate
=
False
):
super
().
__init__
()
self
.
layers
=
_get_clones
(
decoder_layer
,
num_layers
)
self
.
num_layers
=
num_layers
self
.
return_intermediate
=
return_intermediate
# hack implementation for iterative bounding box refinement and two-stage Deformable DETR
self
.
bbox_embed
=
None
self
.
class_embed
=
None
def
forward
(
self
,
tgt
,
reference_points
,
src
,
src_spatial_shapes
,
src_level_start_index
,
src_valid_ratios
,
query_pos
=
None
,
src_padding_mask
=
None
):
output
=
tgt
intermediate
=
[]
intermediate_reference_points
=
[]
for
lid
,
layer
in
enumerate
(
self
.
layers
):
if
reference_points
.
shape
[
-
1
]
==
4
:
reference_points_input
=
reference_points
[:,
:,
None
]
\
*
torch
.
cat
([
src_valid_ratios
,
src_valid_ratios
],
-
1
)[:,
None
]
else
:
assert
reference_points
.
shape
[
-
1
]
==
2
reference_points_input
=
reference_points
[:,
:,
None
]
*
src_valid_ratios
[:,
None
]
output
=
layer
(
output
,
query_pos
,
reference_points_input
,
src
,
src_spatial_shapes
,
src_level_start_index
,
src_padding_mask
)
# hack implementation for iterative bounding box refinement
if
self
.
bbox_embed
is
not
None
:
tmp
=
self
.
bbox_embed
[
lid
](
output
)
if
reference_points
.
shape
[
-
1
]
==
4
:
new_reference_points
=
tmp
+
inverse_sigmoid
(
reference_points
)
new_reference_points
=
new_reference_points
.
sigmoid
()
else
:
assert
reference_points
.
shape
[
-
1
]
==
2
new_reference_points
=
tmp
new_reference_points
[...,
:
2
]
=
tmp
[...,
:
2
]
+
inverse_sigmoid
(
reference_points
)
new_reference_points
=
new_reference_points
.
sigmoid
()
reference_points
=
new_reference_points
.
detach
()
if
self
.
return_intermediate
:
intermediate
.
append
(
output
)
intermediate_reference_points
.
append
(
reference_points
)
if
self
.
return_intermediate
:
return
torch
.
stack
(
intermediate
),
torch
.
stack
(
intermediate_reference_points
)
return
output
,
reference_points
def
_get_clones
(
module
,
N
):
return
nn
.
ModuleList
([
copy
.
deepcopy
(
module
)
for
i
in
range
(
N
)])
def
_get_activation_fn
(
activation
):
"""Return an activation function given a string"""
if
activation
==
"relu"
:
return
F
.
relu
if
activation
==
"gelu"
:
return
F
.
gelu
if
activation
==
"glu"
:
return
F
.
glu
raise
RuntimeError
(
F
"activation should be relu/gelu, not
{
activation
}
."
)
def
build_deforamble_transformer
(
args
):
return
DeformableTransformer
(
d_model
=
args
.
hidden_dim
,
nhead
=
args
.
nheads
,
num_encoder_layers
=
args
.
enc_layers
,
num_decoder_layers
=
args
.
dec_layers
,
dim_feedforward
=
args
.
dim_feedforward
,
dropout
=
args
.
dropout
,
activation
=
"relu"
,
return_intermediate_dec
=
True
,
num_feature_levels
=
args
.
num_feature_levels
,
dec_n_points
=
args
.
dec_n_points
,
enc_n_points
=
args
.
enc_n_points
,
two_stage
=
args
.
two_stage
,
two_stage_num_proposals
=
args
.
num_queries
)
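A minimal construction sketch of the builder above. The Namespace fields mirror the attributes that build_deforamble_transformer reads; the concrete values are illustrative only, and the sketch assumes the detr package (including MSDeformAttn) is importable.

# Hedged sketch: build the transformer from a hand-made args object (values are examples only).
from argparse import Namespace

example_args = Namespace(
    hidden_dim=256, nheads=8, enc_layers=6, dec_layers=6,
    dim_feedforward=1024, dropout=0.1, num_feature_levels=4,
    dec_n_points=4, enc_n_points=4, two_stage=False, num_queries=300,
)
deformable_transformer = build_deforamble_transformer(example_args)
# forward() expects per-level inputs:
#   srcs[i]: [batch, d_model, H_i, W_i], masks[i]: [batch, H_i, W_i] (True on padded pixels),
#   pos_embeds[i]: same shape as srcs[i]; query_embed: [num_queries, 2 * d_model] when two_stage=False.
# Note the deformable attention forward pass relies on the MSDeformAttn op being available.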
projects_oss/detr/detr/models/detr.py (new file, mode 100644)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR model and criterion classes.
"""
import torch
import torch.nn.functional as F
from torch import nn

from detr.util import box_ops
from detr.util.misc import (NestedTensor, nested_tensor_from_tensor_list, accuracy,
                            get_world_size, interpolate, is_dist_avail_and_initialized)

from .backbone import build_backbone
from .matcher import build_matcher
from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm,
                           dice_loss, sigmoid_focal_loss)
from .transformer import build_transformer
from .setcriterion import SetCriterion


class DETR(nn.Module):
    """ This is the DETR module that performs object detection """
    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False, use_focal_loss=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes if use_focal_loss else num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        self.input_proj = nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]


class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    @torch.no_grad()
    def forward(self, outputs, target_sizes):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        """
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = F.softmax(out_logits, -1)
        scores, labels = prob[..., :-1].max(-1)

        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]

        return results


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


def build(args):
    # the `num_classes` naming here is somewhat misleading.
    # it indeed corresponds to `max_obj_id + 1`, where max_obj_id
    # is the maximum id for a class in your dataset. For example,
    # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91.
    # As another example, for a dataset that has a single class with id 1,
    # you should pass `num_classes` to be 2 (max_obj_id + 1).
    # For more details on this, check the following discussion
    # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223
    num_classes = 20 if args.dataset_file != 'coco' else 91
    if args.dataset_file == "coco_panoptic":
        # for panoptic, we just add a num_classes that is large enough to hold
        # max_obj_id + 1, but the exact value doesn't really matter
        num_classes = 250
    device = torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_transformer(args)

    model = DETR(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        aux_loss=args.aux_loss,
    )
    if args.masks:
        model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None))
    matcher = build_matcher(args)
    weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef}
    weight_dict['loss_giou'] = args.giou_loss_coef
    if args.masks:
        weight_dict["loss_mask"] = args.mask_loss_coef
        weight_dict["loss_dice"] = args.dice_loss_coef
    # TODO this is a hack
    if args.aux_loss:
        aux_weight_dict = {}
        for i in range(args.dec_layers - 1):
            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
        weight_dict.update(aux_weight_dict)

    losses = ['labels', 'boxes', 'cardinality']
    if args.masks:
        losses += ["masks"]
    criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict,
                             eos_coef=args.eos_coef, losses=losses)
    criterion.to(device)
    postprocessors = {'bbox': PostProcess()}
    if args.masks:
        postprocessors['segm'] = PostProcessSegm()
        if args.dataset_file == "coco_panoptic":
            is_thing_map = {i: i <= 90 for i in range(201)}
            postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85)

    return model, criterion, postprocessors
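As a rough illustration of the output contract documented above, PostProcess can be exercised on dummy tensors shaped like the model outputs. This is a hedged sketch: the 92-class (91 + no-object) setting and the sizes below are example values, and the snippet assumes it runs in a context where this module's imports resolve.

# Hedged sketch: fake outputs with the documented shapes.
dummy_outputs = {
    "pred_logits": torch.randn(2, 100, 92),  # [batch, num_queries, num_classes + 1]
    "pred_boxes": torch.rand(2, 100, 4),     # normalized (center_x, center_y, w, h)
}
orig_sizes = torch.tensor([[480, 640], [512, 512]])  # [batch, 2] as (height, width)
results = PostProcess()(dummy_outputs, orig_sizes)
# results[0]["boxes"] now holds absolute (x0, y0, x1, y1) coordinates in the original image.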
projects_oss/detr/detr/models/matcher.py (new file, mode 100644)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
from scipy.optimize import linear_sum_assignment
from torch import nn

from detr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, use_focal_loss=False):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"
        self.use_focal_loss = use_focal_loss

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        if self.use_focal_loss:
            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
        else:
            out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        if self.use_focal_loss:
            alpha = 0.25
            gamma = 2.0
            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
        else:
            cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]


def build_matcher(args):
    return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox,
                            cost_giou=args.set_cost_giou)
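A hedged sketch of the matcher on toy data, following the docstring above. The cost weights and the 92-class logits are illustrative choices, not values taken from this repository's configs.

# Hedged sketch: match 100 predictions against 2 ground-truth boxes for one image.
matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
toy_outputs = {
    "pred_logits": torch.randn(1, 100, 92),
    "pred_boxes": torch.rand(1, 100, 4),   # normalized (cx, cy, w, h)
}
toy_targets = [{"labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)}]
indices = matcher(toy_outputs, toy_targets)
# indices[0] is a (prediction_idx, target_idx) pair of int64 tensors, each of length 2 here.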
projects_oss/detr/detr/models/position_encoding.py (new file, mode 100644)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn

from detr.util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None, centered=False):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale
        self.centered = centered

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            if self.centered:
                y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
            else:
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif args.position_embedding in ('v3', 'learned'):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
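A hedged sketch of the sine encoding's shape convention: num_pos_feats channels per axis, so a 256-channel embedding when hidden_dim is 256. The helper nested_tensor_from_tensor_list is not imported in this file; it is assumed to come from detr.util.misc, as in the other modules above.

# Hedged sketch: apply the sine embedding to a padded batch of two images.
from detr.util.misc import nested_tensor_from_tensor_list  # assumed available, as elsewhere in detr

images = [torch.rand(3, 200, 300), torch.rand(3, 180, 240)]
nested = nested_tensor_from_tensor_list(images)            # pads to 200 x 300 and records the mask
pos = PositionEmbeddingSine(num_pos_feats=128, normalize=True)(nested)
# pos has shape [2, 256, 200, 300]: the y sine/cosine channels come first, then the x channels.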
projects_oss/detr/detr/models/segmentation.py (new file, mode 100644)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
This file provides the definition of the convolutional heads used to predict masks, as well as the losses
"""
import io
from collections import defaultdict
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from PIL import Image

import detr.util.box_ops as box_ops
from detr.util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list

try:
    from panopticapi.utils import id2rgb, rgb2id
except ImportError:
    pass


class DETRsegm(nn.Module):
    def __init__(self, detr, freeze_detr=False):
        super().__init__()
        self.detr = detr

        if freeze_detr:
            for p in self.parameters():
                p.requires_grad_(False)

        hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead
        self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0)
        self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim)

    def forward(self, samples: NestedTensor):
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.detr.backbone(samples)

        bs = features[-1].tensors.shape[0]

        src, mask = features[-1].decompose()
        assert mask is not None
        src_proj = self.detr.input_proj(src)
        hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])

        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
        if self.detr.aux_loss:
            out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord)

        # FIXME h_boxes takes the last one computed, keep this in mind
        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

        seg_masks = self.mask_head(src_proj, bbox_mask,
                                   [features[2].tensors, features[1].tensors, features[0].tensors])
        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])

        out["pred_masks"] = outputs_seg_masks
        return out


def _expand(tensor, length: int):
    return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)


class MaskHeadSmallConv(nn.Module):
    """
    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach
    """

    def __init__(self, dim, fpn_dims, context_dim):
        super().__init__()

        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8,
                      context_dim // 16, context_dim // 64]
        self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1)
        self.gn1 = torch.nn.GroupNorm(8, dim)
        self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1)
        self.gn2 = torch.nn.GroupNorm(8, inter_dims[1])
        self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
        self.gn3 = torch.nn.GroupNorm(8, inter_dims[2])
        self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
        self.gn4 = torch.nn.GroupNorm(8, inter_dims[3])
        self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
        self.gn5 = torch.nn.GroupNorm(8, inter_dims[4])
        self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1)

        self.dim = dim

        self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1)
        self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
        self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]):
        x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)

        x = self.lay1(x)
        x = self.gn1(x)
        x = F.relu(x)
        x = self.lay2(x)
        x = self.gn2(x)
        x = F.relu(x)

        cur_fpn = self.adapter1(fpns[0])
        if cur_fpn.size(0) != x.size(0):
            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
        x = self.lay3(x)
        x = self.gn3(x)
        x = F.relu(x)

        cur_fpn = self.adapter2(fpns[1])
        if cur_fpn.size(0) != x.size(0):
            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
        x = self.lay4(x)
        x = self.gn4(x)
        x = F.relu(x)

        cur_fpn = self.adapter3(fpns[2])
        if cur_fpn.size(0) != x.size(0):
            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
        x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
        x = self.lay5(x)
        x = self.gn5(x)
        x = F.relu(x)

        x = self.out_lay(x)
        return x


class MHAttentionMap(nn.Module):
    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
        self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)

        nn.init.zeros_(self.k_linear.bias)
        nn.init.zeros_(self.q_linear.bias)
        nn.init.xavier_uniform_(self.k_linear.weight)
        nn.init.xavier_uniform_(self.q_linear.weight)
        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5

    def forward(self, q, k, mask: Optional[Tensor] = None):
        q = self.q_linear(q)
        k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
        qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
        kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
        weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)

        if mask is not None:
            weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf"))
        weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size())
        weights = self.dropout(weights)
        return weights


def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
    """
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes


def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
               positive vs negative examples. Default = 0.25.
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
    Returns:
        Loss tensor
    """
    prob = inputs.sigmoid()
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    return loss.mean(1).sum() / num_boxes
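A hedged sketch of the two mask losses above on toy data: the mask size, batch of three matched masks, and threshold are illustrative only; num_boxes is the normalizer the criterion passes in.

# Hedged sketch: both losses on three flattened 64x64 masks.
toy_logits = torch.randn(3, 64 * 64)                    # raw predictions for 3 matched masks
toy_gt = (torch.rand(3, 64 * 64) > 0.5).float()         # binary ground-truth masks
print(dice_loss(toy_logits, toy_gt, num_boxes=3))
print(sigmoid_focal_loss(toy_logits, toy_gt, num_boxes=3))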
class PostProcessSegm(nn.Module):
    def __init__(self, threshold=0.5):
        super().__init__()
        self.threshold = threshold

    @torch.no_grad()
    def forward(self, results, outputs, orig_target_sizes, max_target_sizes):
        assert len(orig_target_sizes) == len(max_target_sizes)
        max_h, max_w = max_target_sizes.max(0)[0].tolist()
        outputs_masks = outputs["pred_masks"].squeeze(2)
        outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False)
        outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu()

        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
            img_h, img_w = t[0], t[1]
            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
            results[i]["masks"] = F.interpolate(
                results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
            ).byte()

        return results


class PostProcessPanoptic(nn.Module):
    """This class converts the output of the model to the final panoptic result, in the format expected by the
    coco panoptic API """

    def __init__(self, is_thing_map, threshold=0.85):
        """
        Parameters:
            is_thing_map: This is a dict whose keys are the class ids, and the values a boolean indicating whether
                          the class is a thing (True) or a stuff (False) class
            threshold: confidence threshold: segments with confidence lower than this will be deleted
        """
        super().__init__()
        self.threshold = threshold
        self.is_thing_map = is_thing_map

    def forward(self, outputs, processed_sizes, target_sizes=None):  # noqa: C901
        """ This function computes the panoptic prediction from the model's predictions.
        Parameters:
            outputs: This is a dict coming directly from the model. See the model doc for the content.
            processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to
                             the model, ie the size after data augmentation but before batching.
            target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size
                          of each prediction. If left to None, it will default to the processed_sizes
        """
        if target_sizes is None:
            target_sizes = processed_sizes
        assert len(processed_sizes) == len(target_sizes)
        out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"]
        assert len(out_logits) == len(raw_masks) == len(target_sizes)
        preds = []

        def to_tuple(tup):
            if isinstance(tup, tuple):
                return tup
            return tuple(tup.cpu().tolist())

        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
        ):
            # we filter empty queries and detection below threshold
            scores, labels = cur_logits.softmax(-1).max(-1)
            keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold)
            cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
            cur_scores = cur_scores[keep]
            cur_classes = cur_classes[keep]
            cur_masks = cur_masks[keep]
            cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
            cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep])

            h, w = cur_masks.shape[-2:]
            assert len(cur_boxes) == len(cur_classes)

            # It may be that we have several predicted masks for the same stuff class.
            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
            cur_masks = cur_masks.flatten(1)
            stuff_equiv_classes = defaultdict(lambda: [])
            for k, label in enumerate(cur_classes):
                if not self.is_thing_map[label.item()]:
                    stuff_equiv_classes[label.item()].append(k)

            def get_ids_area(masks, scores, dedup=False):
                # This helper function creates the final panoptic segmentation image
                # It also returns the area of the masks that appears on the image

                m_id = masks.transpose(0, 1).softmax(-1)

                if m_id.shape[-1] == 0:
                    # We didn't detect any mask :(
                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
                else:
                    m_id = m_id.argmax(-1).view(h, w)

                if dedup:
                    # Merge the masks corresponding to the same stuff class
                    for equiv in stuff_equiv_classes.values():
                        if len(equiv) > 1:
                            for eq_id in equiv:
                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])

                final_h, final_w = to_tuple(target_size)

                seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy()))
                seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)

                np_seg_img = (
                    torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
                    .view(final_h, final_w, 3).numpy()
                )
                m_id = torch.from_numpy(rgb2id(np_seg_img))

                area = []
                for i in range(len(scores)):
                    area.append(m_id.eq(i).sum().item())
                return area, seg_img

            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
            if cur_classes.numel() > 0:
                # We now filter empty masks as long as we find some
                while True:
                    filtered_small = torch.as_tensor(
                        [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
                    )
                    if filtered_small.any().item():
                        cur_scores = cur_scores[~filtered_small]
                        cur_classes = cur_classes[~filtered_small]
                        cur_masks = cur_masks[~filtered_small]
                        area, seg_img = get_ids_area(cur_masks, cur_scores)
                    else:
                        break
            else:
                cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)

            segments_info = []
            for i, a in enumerate(area):
                cat = cur_classes[i].item()
                segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a})
            del cur_classes

            with io.BytesIO() as out:
                seg_img.save(out, format="PNG")
                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
            preds.append(predictions)
        return preds
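For reference, the thing/stuff mapping that PostProcessPanoptic expects mirrors what build() in detr.py constructs for COCO panoptic; the snippet below just restates that wiring as a standalone sketch.

# Hedged sketch: COCO-style mapping where ids <= 90 are "thing" classes, the rest "stuff".
is_thing_map = {i: i <= 90 for i in range(201)}
panoptic_postprocessor = PostProcessPanoptic(is_thing_map, threshold=0.85)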
projects_oss/detr/detr/models/setcriterion.py (new file, mode 100644)
import copy

import torch
import torch.nn.functional as F
from torch import nn

from ..util import box_ops
from ..util.misc import (nested_tensor_from_tensor_list, accuracy, get_world_size,
                         interpolate, is_dist_avail_and_initialized)

from .segmentation import dice_loss, sigmoid_focal_loss


class SetCriterion(nn.Module):
    """ This class computes the loss for DETR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """
    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
        """ Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses
        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer('empty_weight', empty_weight)

    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
        """Classification loss (NLL)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o

        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
        losses = {'loss_ce': loss_ce}

        if log:
            # TODO this should probably be a separate loss, not hacked in this one here
            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
        return losses

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
        """
        pred_logits = outputs['pred_logits']
        device = pred_logits.device
        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
        losses = {'cardinality_error': card_err}
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses

    def loss_masks(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the masks: the focal loss and the dice loss.
           targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # upsample predictions to the target size
        src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                                mode="bilinear", align_corners=False)
        src_masks = src_masks[:, 0].flatten(1)

        target_masks = target_masks.flatten(1)
        target_masks = target_masks.view(src_masks.shape)
        losses = {
            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
        }
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'cardinality': self.loss_cardinality,
            'boxes': self.loss_boxes,
            'masks': self.loss_masks
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def forward(self, outputs, targets):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    if loss == 'masks':
                        # Intermediate masks losses are too costly to compute, we ignore them.
                        continue
                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs = {'log': False}
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        return losses
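The criterion returns an unweighted loss dict; in the reference DETR training loop those entries are reduced with weight_dict before backpropagation. A hedged sketch of that reduction follows (the training loop itself is not part of this file, and criterion, outputs, and targets are assumed to exist in the caller).

# Hedged sketch: combine the per-term losses with their configured weights.
loss_dict = criterion(outputs, targets)  # e.g. {'loss_ce': ..., 'loss_bbox': ..., 'loss_giou': ..., ...}
total_loss = sum(loss_dict[k] * criterion.weight_dict[k]
                 for k in loss_dict if k in criterion.weight_dict)
total_loss.backward()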
class
FocalLossSetCriterion
(
nn
.
Module
):
""" This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def
__init__
(
self
,
num_classes
,
matcher
,
weight_dict
,
losses
,
focal_alpha
=
0.25
):
""" Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
matcher: module able to compute a matching between targets and proposals
weight_dict: dict containing as key the names of the losses and as values their relative weight.
losses: list of all the losses to be applied. See get_loss for list of available losses.
focal_alpha: alpha in Focal Loss
"""
super
().
__init__
()
self
.
num_classes
=
num_classes
self
.
matcher
=
matcher
self
.
weight_dict
=
weight_dict
self
.
losses
=
losses
self
.
focal_alpha
=
focal_alpha
def
loss_labels
(
self
,
outputs
,
targets
,
indices
,
num_boxes
,
log
=
True
):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert
'pred_logits'
in
outputs
src_logits
=
outputs
[
'pred_logits'
]
idx
=
self
.
_get_src_permutation_idx
(
indices
)
target_classes_o
=
torch
.
cat
([
t
[
"labels"
][
J
]
for
t
,
(
_
,
J
)
in
zip
(
targets
,
indices
)])
target_classes
=
torch
.
full
(
src_logits
.
shape
[:
2
],
self
.
num_classes
,
dtype
=
torch
.
int64
,
device
=
src_logits
.
device
)
target_classes
[
idx
]
=
target_classes_o
target_classes_onehot
=
torch
.
zeros
([
src_logits
.
shape
[
0
],
src_logits
.
shape
[
1
],
src_logits
.
shape
[
2
]
+
1
],
dtype
=
src_logits
.
dtype
,
layout
=
src_logits
.
layout
,
device
=
src_logits
.
device
)
target_classes_onehot
.
scatter_
(
2
,
target_classes
.
unsqueeze
(
-
1
),
1
)
target_classes_onehot
=
target_classes_onehot
[:,:,:
-
1
]
loss_ce
=
sigmoid_focal_loss
(
src_logits
,
target_classes_onehot
,
num_boxes
,
alpha
=
self
.
focal_alpha
,
gamma
=
2
)
*
src_logits
.
shape
[
1
]
losses
=
{
'loss_ce'
:
loss_ce
}
if
log
:
# TODO this should probably be a separate loss, not hacked in this one here
losses
[
'class_error'
]
=
100
-
accuracy
(
src_logits
[
idx
],
target_classes_o
)[
0
]
return
losses
@
torch
.
no_grad
()
def
loss_cardinality
(
self
,
outputs
,
targets
,
indices
,
num_boxes
):
""" Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
"""
pred_logits
=
outputs
[
'pred_logits'
]
device
=
pred_logits
.
device
tgt_lengths
=
torch
.
as_tensor
([
len
(
v
[
"labels"
])
for
v
in
targets
],
device
=
device
)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred
=
(
pred_logits
.
argmax
(
-
1
)
!=
pred_logits
.
shape
[
-
1
]
-
1
).
sum
(
1
)
card_err
=
F
.
l1_loss
(
card_pred
.
float
(),
tgt_lengths
.
float
())
losses
=
{
'cardinality_error'
:
card_err
}
return
losses
def
loss_boxes
(
self
,
outputs
,
targets
,
indices
,
num_boxes
):
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
"""
assert
'pred_boxes'
in
outputs
idx
=
self
.
_get_src_permutation_idx
(
indices
)
src_boxes
=
outputs
[
'pred_boxes'
][
idx
]
target_boxes
=
torch
.
cat
([
t
[
'boxes'
][
i
]
for
t
,
(
_
,
i
)
in
zip
(
targets
,
indices
)],
dim
=
0
)
loss_bbox
=
F
.
l1_loss
(
src_boxes
,
target_boxes
,
reduction
=
'none'
)
losses
=
{}
losses
[
'loss_bbox'
]
=
loss_bbox
.
sum
()
/
num_boxes
loss_giou
=
1
-
torch
.
diag
(
box_ops
.
generalized_box_iou
(
box_ops
.
box_cxcywh_to_xyxy
(
src_boxes
),
box_ops
.
box_cxcywh_to_xyxy
(
target_boxes
)))
losses
[
'loss_giou'
]
=
loss_giou
.
sum
()
/
num_boxes
return
losses
def
loss_masks
(
self
,
outputs
,
targets
,
indices
,
num_boxes
):
"""Compute the losses related to the masks: the focal loss and the dice loss.
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
"""
assert
"pred_masks"
in
outputs
src_idx
=
self
.
_get_src_permutation_idx
(
indices
)
tgt_idx
=
self
.
_get_tgt_permutation_idx
(
indices
)
src_masks
=
outputs
[
"pred_masks"
]
# TODO use valid to mask invalid areas due to padding in loss
target_masks
,
valid
=
nested_tensor_from_tensor_list
([
t
[
"masks"
]
for
t
in
targets
]).
decompose
()
target_masks
=
target_masks
.
to
(
src_masks
)
src_masks
=
src_masks
[
src_idx
]
# upsample predictions to the target size
src_masks
=
interpolate
(
src_masks
[:,
None
],
size
=
target_masks
.
shape
[
-
2
:],
mode
=
"bilinear"
,
align_corners
=
False
)
src_masks
=
src_masks
[:,
0
].
flatten
(
1
)
target_masks
=
target_masks
[
tgt_idx
].
flatten
(
1
)
losses
=
{
"loss_mask"
:
sigmoid_focal_loss
(
src_masks
,
target_masks
,
num_boxes
),
"loss_dice"
:
dice_loss
(
src_masks
,
target_masks
,
num_boxes
),
}
return
losses
def
_get_src_permutation_idx
(
self
,
indices
):
# permute predictions following indices
batch_idx
=
torch
.
cat
([
torch
.
full_like
(
src
,
i
)
for
i
,
(
src
,
_
)
in
enumerate
(
indices
)])
src_idx
=
torch
.
cat
([
src
for
(
src
,
_
)
in
indices
])
return
batch_idx
,
src_idx
def
_get_tgt_permutation_idx
(
self
,
indices
):
# permute targets following indices
batch_idx
=
torch
.
cat
([
torch
.
full_like
(
tgt
,
i
)
for
i
,
(
_
,
tgt
)
in
enumerate
(
indices
)])
tgt_idx
=
torch
.
cat
([
tgt
for
(
_
,
tgt
)
in
indices
])
return
batch_idx
,
tgt_idx
def
get_loss
(
self
,
loss
,
outputs
,
targets
,
indices
,
num_boxes
,
**
kwargs
):
loss_map
=
{
'labels'
:
self
.
loss_labels
,
'cardinality'
:
self
.
loss_cardinality
,
'boxes'
:
self
.
loss_boxes
,
'masks'
:
self
.
loss_masks
}
assert
loss
in
loss_map
,
f
'do you really want to compute
{
loss
}
loss?'
return
loss_map
[
loss
](
outputs
,
targets
,
indices
,
num_boxes
,
**
kwargs
)
    def forward(self, outputs, targets):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depend on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs' and k != 'enc_outputs'}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            kwargs = {}
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes, **kwargs))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    if loss == 'masks':
                        # Intermediate masks losses are too costly to compute, we ignore them.
                        continue
                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs['log'] = False
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        if 'enc_outputs' in outputs:
            enc_outputs = outputs['enc_outputs']
            bin_targets = copy.deepcopy(targets)
            for bt in bin_targets:
                bt['labels'] = torch.zeros_like(bt['labels'])
            indices = self.matcher(enc_outputs, bin_targets)
            for loss in self.losses:
                if loss == 'masks':
                    # Intermediate masks losses are too costly to compute, we ignore them.
                    continue
                kwargs = {}
                if loss == 'labels':
                    # Logging is enabled only for the last layer
                    kwargs['log'] = False
                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs)
                l_dict = {k + f'_enc': v for k, v in l_dict.items()}
                losses.update(l_dict)

        return losses
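For reference, a hedged sketch of the input contract that forward expects, using only the keys read by the losses above; shapes and numbers are illustrative, and the criterion construction itself is assumed to have happened elsewhere:

import torch

batch_size, num_queries, num_classes = 2, 100, 91

# Model outputs: per-query class logits and normalized (cx, cy, w, h) boxes.
outputs = {
    "pred_logits": torch.randn(batch_size, num_queries, num_classes + 1),  # +1 no-object class in plain DETR
    "pred_boxes": torch.rand(batch_size, num_queries, 4),
}

# One dict per image; only the keys required by the configured losses need to be present.
targets = [
    {"labels": torch.tensor([17, 3]), "boxes": torch.rand(2, 4)},
    {"labels": torch.tensor([54]), "boxes": torch.rand(1, 4)},
]

# losses = criterion(outputs, targets)
# -> a dict such as {'loss_ce': ..., 'loss_bbox': ..., 'loss_giou': ...}, plus '_{i}' / '_enc'
#    suffixed entries when auxiliary or encoder outputs are present.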
projects_oss/detr/detr/models/transformer.py
0 → 100644
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
* positional encodings are passed in MHattention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn, Tensor
class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
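A hedged shape walk-through of the Transformer class defined above, using made-up feature-map sizes (class defaults: d_model=512, nhead=8):

import torch

bs, c, h, w = 2, 512, 25, 38          # backbone feature map (N, C, H, W)
num_queries = 100

src = torch.randn(bs, c, h, w)
mask = torch.zeros(bs, h, w, dtype=torch.bool)   # padding mask, True where padded
query_embed = torch.randn(num_queries, c)        # learned object queries
pos_embed = torch.randn(bs, c, h, w)             # positional encoding, same shape as src

model = Transformer(d_model=c, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
                    return_intermediate_dec=True)
hs, memory = model(src, mask, query_embed, pos_embed)
print(hs.shape)      # (num_decoder_layers, bs, num_queries, c)
print(memory.shape)  # (bs, c, h, w)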
class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output
class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)
class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
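normalize_before only switches whether LayerNorm runs before (pre-norm) or after (post-norm) each sub-block; both variants preserve the (sequence, batch, d_model) shape, as this small sketch using the layer defined above shows:

import torch

layer_post = TransformerEncoderLayer(d_model=256, nhead=8)                         # post-norm (default)
layer_pre = TransformerEncoderLayer(d_model=256, nhead=8, normalize_before=True)   # pre-norm

x = torch.randn(600, 2, 256)     # e.g. a flattened 20x30 feature map, batch of 2
pos = torch.randn(600, 2, 256)
print(layer_post(x, pos=pos).shape)  # torch.Size([600, 2, 256])
print(layer_pre(x, pos=pos).shape)   # torch.Size([600, 2, 256])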
class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(self.with_pos_embed(tgt, query_pos),
                                   self.with_pos_embed(memory, pos),
                                   memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(self.with_pos_embed(tgt2, query_pos),
                                   self.with_pos_embed(memory, pos),
                                   memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def build_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )
def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
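build_transformer only reads a handful of attributes from args, so it can be driven from any object exposing them; a minimal sketch using a plain namespace (attribute names as used above, values illustrative):

from argparse import Namespace

args = Namespace(
    hidden_dim=256,        # d_model
    dropout=0.1,
    nheads=8,
    dim_feedforward=2048,
    enc_layers=6,
    dec_layers=6,
    pre_norm=False,
)
transformer = build_transformer(args)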
projects_oss/detr/detr/modules/__init__.py
0 → 100644
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from .ms_deform_attn import MSDeformAttn
projects_oss/detr/detr/modules/ms_deform_attn.py
0 → 100644
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import warnings
import math

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_

from ..functions import MSDeformAttnFunction
def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    return (n & (n - 1) == 0) and n != 0
class MSDeformAttn(nn.Module):
    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()
    def _reset_parameters(self):
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) \
            .view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)
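The bias initialization above gives every attention head a distinct starting direction for its sampling offsets; a small stand-alone calculation of those directions for n_heads=8:

import math
import torch

n_heads = 8
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)[0]
print(grid_init)
# Eight distinct axis-aligned/diagonal unit offsets, one per head; in _reset_parameters they are
# repeated over levels and points and scaled by (i + 1) for the i-th sampling point.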
    def forward(self, query, reference_points, input_flatten, input_spatial_shapes,
                input_level_start_index, input_padding_mask=None):
        """
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
        output = MSDeformAttnFunction.apply(
            value, input_spatial_shapes, input_level_start_index,
            sampling_locations, attention_weights, self.im2col_step)
        output = self.output_proj(output)
        return output
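A hedged usage sketch matching the shapes documented in forward, with two made-up feature levels; the commented-out call would additionally require the compiled MSDeformAttnFunction CUDA kernel from ..functions:

import torch

# Two feature levels, e.g. strides 16 and 32 on a 512x512 image.
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)   # (n_levels, 2) as (H_l, W_l)
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])
)
print(level_start_index)              # tensor([   0, 1024])
print(spatial_shapes.prod(1).sum())   # 1280 = total flattened length Len_in

N, Len_q, d_model, n_levels = 2, 300, 256, 2
query = torch.randn(N, Len_q, d_model)
input_flatten = torch.randn(N, int(spatial_shapes.prod(1).sum()), d_model)
# Normalized (x, y) reference point per query, broadcast to every level.
reference_points = torch.rand(N, Len_q, 1, 2).expand(N, Len_q, n_levels, 2)

# attn = MSDeformAttn(d_model=d_model, n_levels=n_levels, n_heads=8, n_points=4)
# out = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
# out.shape == (N, Len_q, d_model)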
projects_oss/detr/detr/runner.py
0 → 100644
#!/usr/bin/env python3
from detr.d2 import DetrDatasetMapper, add_detr_config
from detectron2.solver.build import maybe_add_gradient_clipping

from d2go.config import CfgNode as CN
from d2go.runner import GeneralizedRCNNRunner
from d2go.data.dataset_mappers.build import D2GO_DATA_MAPPER_REGISTRY
from d2go.data.dataset_mappers.d2go_dataset_mapper import D2GoDatasetMapper
@D2GO_DATA_MAPPER_REGISTRY.register()
class DETRDatasetMapper(DetrDatasetMapper, D2GoDatasetMapper):
    def __init__(self, cfg, is_train=True, image_loader=None, tfm_gens=None):
        self.image_loader = None
        self.backfill_size = False
        self.retry = 3
        self.catch_exception = True
        self._error_count = 0
        self._total_counts = 0
        self._error_types = {}
        super().__init__(cfg, is_train)

    def _original_call(self, dataset_dict):
        return DetrDatasetMapper.__call__(self, dataset_dict)

    def __call__(self, dataset_dict):
        return D2GoDatasetMapper.__call__(self, dataset_dict)
class DETRRunner(GeneralizedRCNNRunner):
    def get_default_cfg(self):
        _C = super().get_default_cfg()
        add_detr_config(_C)
        _C.MODEL.DETR = CN(_C.MODEL.DETR)
        return _C
    # TODO rm this after update optimizer
    @classmethod
    def build_optimizer(cls, cfg, model):
        import torch
        import itertools
        from typing import Any, Dict, List, Set
        from detectron2.solver.build import maybe_add_gradient_clipping

        params: List[Dict[str, Any]] = []
        memo: Set[torch.nn.parameter.Parameter] = set()
        for key, value in model.named_parameters(recurse=True):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)
            lr = cfg.SOLVER.BASE_LR
            weight_decay = cfg.SOLVER.WEIGHT_DECAY
            if "backbone.0" in key:
                lr = lr * 0.1  # cfg.SOLVER.BACKBONE_MULTIPLIER
            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]

        def maybe_add_full_model_gradient_clipping(optim):
            # optim: the optimizer class
            # detectron2 doesn't have full model gradient clipping now
            clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
            enable = (
                cfg.SOLVER.CLIP_GRADIENTS.ENABLED
                and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
                and clip_norm_val > 0.0
            )

            class FullModelGradientClippingOptimizer(optim):
                def step(self, closure=None):
                    all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                    super().step(closure=closure)

            return FullModelGradientClippingOptimizer if enable else optim

        optimizer_type = cfg.SOLVER.OPTIMIZER
        if optimizer_type == "SGD":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
                params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
            )
        elif optimizer_type == "ADAMW":
            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
                params, cfg.SOLVER.BASE_LR
            )
        else:
            raise NotImplementedError(f"no optimizer type {optimizer_type}")
        if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
        return optimizer
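To make the parameter grouping above concrete, a toy illustration (module names are assumptions, not part of this runner) of how parameters whose names contain "backbone.0" end up with a 10x smaller learning rate:

import torch

model = torch.nn.Sequential()
model.add_module("backbone", torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3)))
model.add_module("head", torch.nn.Linear(8, 4))

base_lr, weight_decay = 1e-4, 1e-4
params = []
for key, value in model.named_parameters():
    if not value.requires_grad:
        continue
    lr = base_lr * 0.1 if "backbone.0" in key else base_lr
    params.append({"params": [value], "lr": lr, "weight_decay": weight_decay})

opt = torch.optim.AdamW(params, base_lr)
print(sorted({g["lr"] for g in opt.param_groups}))  # [1e-05, 0.0001]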