OpenDAS / Pytorch-Encoding · Commits · b872eb8c

Commit b872eb8c (unverified), authored Apr 17, 2020 by Hang Zhang, committed by GitHub on Apr 17, 2020.

ResNeSt plus (#256)

Parent: 5a1e3fbc
Changes: 101 files in the full commit; this page shows 20 changed files, with 975 additions and 499 deletions (+975 -499).
Files changed on this page:

  encoding/datasets/cityscapescoarse.py   +158   -0
  encoding/datasets/folder.py              +70   -0
  encoding/datasets/hpw18.py              +104   -0
  encoding/datasets/imagenet.py             +3   -0
  encoding/functions/__init__.py            +2   -0
  encoding/functions/dist_syncbn.py       +106   -0
  encoding/functions/rectify.py            +47   -0
  encoding/functions/syncbn.py              +1   -1
  encoding/lib/__init__.py                  +2   -1
  encoding/lib/cpu/operator.cpp             +1   -0
  encoding/lib/cpu/operator.h               +9   -0
  encoding/lib/cpu/rectify_cpu.cpp        +234   -0
  encoding/lib/gpu/activation_kernel.cu     +2   -3
  encoding/lib/gpu/encodingv2_kernel.cu     +0   -427
  encoding/lib/gpu/operator.cpp             +1   -8
  encoding/lib/gpu/operator.h              +10   -39
  encoding/lib/gpu/rectify_cuda.cu        +222   -0
  encoding/lib/gpu/setup.py                 +1   -1
  encoding/lib/gpu/syncbn_kernel.cu         +1   -1
  encoding/models/__init__.py               +1   -18
encoding/datasets/cityscapescoarse.py (new file, 0 → 100644)

###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from tqdm import tqdm

from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform

class Segmentation(data.Dataset):
    BASE_DIR = 'cityscapes'
    def __init__(self, data_folder, mode='train', transform=None, target_transform=None):
        self.root = os.path.join(data_folder, self.BASE_DIR)
        self.transform = transform
        self.target_transform = target_transform
        self.mode = mode
        self.images, self.masks = get_city_pairs(self.root, mode)
        assert (len(self.images) == len(self.masks))
        if len(self.images) == 0:
            raise RuntimeError("Found 0 images in subfolders of: " + self.root + "\n")

    def __getitem__(self, index):
        img = Image.open(self.images[index]).convert('RGB')
        if self.mode == 'test':
            if self.transform is not None:
                img = self.transform(img)
            return img, os.path.basename(self.images[index])
        mask = Image.open(self.masks[index])  # .convert("P")
        mask = np.array(mask)
        mask += 1
        mask[mask == 256] = 0
        mask = Image.fromarray(mask)
        # synchronized transform
        if self.mode == 'train':
            img, mask = self._sync_transform(img, mask)
        elif self.mode == 'val':
            img, mask = self._val_sync_transform(img, mask)
        else:
            raise RuntimeError('unknown mode for dataloader: {}'.format(self.mode))
        # general resize, normalize and toTensor
        if self.transform is not None:
            # print("transform for input")
            img = self.transform(img)
        if self.target_transform is not None:
            # print("transform for label")
            mask = self.target_transform(mask)
        return img, mask

    def __len__(self):
        return len(self.images)

    def _val_sync_transform(self, img, mask):
        """synchronized transformation"""
        outsize = 720
        short = outsize
        w, h = img.size
        if w > h:
            oh = short
            ow = int(1.0 * w * oh / h)
        else:
            ow = short
            oh = int(1.0 * h * ow / w)
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # center crop
        w, h = img.size
        x1 = int(round((w - outsize) / 2.))
        y1 = int(round((h - outsize) / 2.))
        img = img.crop((x1, y1, x1 + outsize, y1 + outsize))
        mask = mask.crop((x1, y1, x1 + outsize, y1 + outsize))
        return img, mask

    def _sync_transform(self, img, mask):
        # random mirror
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
        base_size = 2048
        crop_size = 720
        # random scale (short edge from 480 to 720)
        long_size = random.randint(int(base_size * 0.5), int(base_size * 2.0))
        w, h = img.size
        if h > w:
            oh = long_size
            ow = int(1.0 * w * oh / h)
            short_size = ow
        else:
            ow = long_size
            oh = int(1.0 * h * ow / w)
            short_size = oh
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # random rotate -10~10, mask using NN rotate
        deg = random.uniform(-10, 10)
        img = img.rotate(deg, resample=Image.BILINEAR)
        mask = mask.rotate(deg, resample=Image.NEAREST)
        # pad crop
        if short_size < crop_size:
            padh = crop_size - oh if oh < crop_size else 0
            padw = crop_size - ow if ow < crop_size else 0
            img = ImageOps.expand(img, border=(0, 0, padw, padh), fill=0)
            mask = ImageOps.expand(mask, border=(0, 0, padw, padh), fill=0)
        # random crop 480
        w, h = img.size
        x1 = random.randint(0, w - crop_size)
        y1 = random.randint(0, h - crop_size)
        img = img.crop((x1, y1, x1 + crop_size, y1 + crop_size))
        mask = mask.crop((x1, y1, x1 + crop_size, y1 + crop_size))
        # gaussian blur as in PSP ?
        if random.random() < 0.5:
            img = img.filter(ImageFilter.GaussianBlur(radius=random.random()))
        return img, mask

def get_city_pairs(folder, mode='train'):
    img_paths = []
    mask_paths = []
    if mode == 'train':
        img_folder = os.path.join(folder, 'leftImg8bit/train_extra')
        mask_folder = os.path.join(folder, 'gtCoarse/train_extra')
    else:
        img_folder = os.path.join(folder, 'leftImg8bit/val')
        mask_folder = os.path.join(folder, 'gtFine/val')
    for root, directories, files in os.walk(img_folder):
        for filename in files:
            basename, extension = os.path.splitext(filename)
            if filename.endswith(".png"):
                imgpath = os.path.join(root, filename)
                foldername = os.path.basename(os.path.dirname(imgpath))
                maskname = filename.replace('leftImg8bit', 'gtCoarse_trainIds')
                maskpath = os.path.join(mask_folder, foldername, maskname)
                if os.path.isfile(imgpath) and os.path.isfile(maskpath):
                    img_paths.append(imgpath)
                    mask_paths.append(maskpath)
                else:
                    print('cannot find the mask or image:', imgpath, maskpath)
    return img_paths, mask_paths
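
For orientation, a minimal usage sketch of the new dataset class follows. The data path, batch size, and normalization constants are placeholders, not values from this commit, and the import path simply mirrors the file location above.

# Sketch: build the coarse-Cityscapes dataset and iterate it (paths are hypothetical).
import torch.utils.data as data
import torchvision.transforms as transform
from encoding.datasets.cityscapescoarse import Segmentation

input_transform = transform.Compose([
    transform.ToTensor(),
    transform.Normalize([.485, .456, .406], [.229, .224, .225])])
trainset = Segmentation('/path/to/data', mode='train', transform=input_transform)
trainloader = data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)
img, mask = trainset[0]  # tensor image and PIL mask (no target_transform given here)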
encoding/datasets/folder.py (new file, 0 → 100644)

###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
import torch.utils.data as data

from PIL import Image, ImageOps
import torch.utils.data as data
import torchvision.transforms as transform

from .dataset import ToLabel

class FolderLoader(data.Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        self.images = get_folder_images(root)
        if len(self.images) == 0:
            raise RuntimeError("Found 0 images in subfolders of: " + self.root + "\n")

    def __getitem__(self, index):
        img = Image.open(self.images[index]).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, os.path.basename(self.images[index])

    def __len__(self):
        return len(self.images)

def get_folder_images(img_folder):
    img_paths = []
    for filename in os.listdir(img_folder):
        if filename.endswith(".jpg"):
            imgpath = os.path.join(img_folder, filename)
            img_paths.append(imgpath)
    return img_paths

class Dataloder():
    def __init__(self, args):
        # the data augmentation is implemented as part of the dataloader
        assert(args.test)
        input_transform = transform.Compose([
            transform.ToTensor(),
            transform.Normalize(args.mean, args.std)])
        args.test_batch_size = 1
        assert(args.test_folder is not None)
        print('loading the data from: {}'.format(args.test_folder))
        testset = FolderLoader(args.test_folder, input_transform)
        kwargs = {'num_workers': args.workers, 'pin_memory': True} \
            if args.cuda else {}
        self.trainloader = None
        self.testloader = data.DataLoader(testset, batch_size=args.test_batch_size,
                                          shuffle=False, **kwargs)

    def getloader(self):
        return self.trainloader, self.testloader
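
A similarly small sketch of the test-time folder loader; the directory path and normalization constants are hypothetical, and the loader is driven directly rather than through the args-based Dataloder wrapper.

# Sketch: iterate a flat folder of *.jpg images with FolderLoader.
import torch.utils.data as data
import torchvision.transforms as transform
from encoding.datasets.folder import FolderLoader

input_transform = transform.Compose([
    transform.ToTensor(),
    transform.Normalize([.485, .456, .406], [.229, .224, .225])])
testset = FolderLoader('/path/to/jpegs', input_transform)
testloader = data.DataLoader(testset, batch_size=1, shuffle=False)
for img, name in testloader:
    pass  # img is the transformed image batch, name the source filename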
encoding/datasets/hpw18.py (new file, 0 → 100644)

# created by: Sean Liu
# Amazon Lab 126
from __future__ import print_function
import errno
import hashlib
import os
import sys
import tarfile
import numpy as np
import random
import math

import torch.utils.data as data
import PIL
from PIL import Image, ImageOps
from six.moves import urllib

class Segmentation_HPW18(data.Dataset):
    CLASSES = [
        'background', 'hat', 'hair', 'sunglasses', 'upper-clothes', 'skirt',
        'pants', 'dress', 'belt', 'left-shoe', 'right-shoe', 'face', 'left-leg',
        'right-leg', 'left-arm', 'right-arm', 'bag', 'scarf'
    ]
    URL = "/cvdata1/lliuqian/humanParsingDataset"
    FILE = "hpw18.tar.gz"
    MD5 = ''
    BASE_DIR = ''

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = root
        _hpw18_root = os.path.join(self.root, self.BASE_DIR)
        _mask_dir = os.path.join(_hpw18_root, 'SegmentationClassAug_256x384')
        _image_dir = os.path.join(_hpw18_root, 'JPEGImages_256x384')
        self.transform = transform
        self.target_transform = target_transform
        self.train = train
        if download:
            self._download()
        # train/val/test splits are pre-cut
        _splits_dir = _hpw18_root
        _split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_train.txt')
        if not self.train:
            _split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_val.txt')
        print("reading from ", _split_f)
        self.images = []
        self.masks = []
        with open(os.path.join(_split_f), "r") as lines:
            for line in lines:
                s = line.split()
                _image = s[0]  # absolute path of the image
                _mask = s[1]   # absolute path of the mask
                assert os.path.isfile(_image)
                assert os.path.isfile(_mask)
                self.images.append(_image)
                self.masks.append(_mask)
        assert (len(self.images) == len(self.masks))

    def __getitem__(self, index):
        _img = Image.open(self.images[index]).convert('RGB')
        _timg = Image.open(self.masks[index])
        _target = np.array(_timg, dtype=np.uint8)
        _target = Image.fromarray(_target)
        # synchronized transform
        if self.train:
            _img, _target = self._sync_transform(_img, _target)
        # general resize, normalize and toTensor
        if self.transform is not None:
            _img = self.transform(_img)
        if self.target_transform is not None:
            _target = self.target_transform(_target)
        return _img, _target

    def __len__(self):
        return len(self.images)

    def _sync_transform(self, img, mask):
        # random rotate -10~10
        deg = random.uniform(-10, 10)
        img = img.rotate(deg)
        mask = mask.rotate(deg, PIL.Image.NEAREST)
        return img, mask

if __name__ == '__main__':
    hpw18 = Segmentation_HPW18('/cvdata1/lliuqian/', train=True)
    print(hpw18[0])
    print(len(hpw18))
encoding/datasets/imagenet.py (modified)

@@ -11,6 +11,9 @@ import os
 import torchvision.transforms as transforms
 import torchvision.datasets as datasets
+import warnings
+warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
+
 class ImageNetDataset(datasets.ImageFolder):
     BASE_DIR = "ILSVRC2012"
     def __init__(self, root=os.path.expanduser('~/.encoding/data'), transform=None,
 ...
encoding/functions/__init__.py (modified)

 """Encoding Autograd Fuctions"""
 from .encoding import *
 from .syncbn import *
+from .dist_syncbn import dist_syncbatchnorm
 from .customize import *
+from .rectify import *
encoding/functions/dist_syncbn.py (new file, 0 → 100644)

##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
from torch.autograd.function import Function

from .. import lib

__all__ = ['dist_syncbatchnorm']

class dist_syncbatchnorm_(Function):
    @staticmethod
    def forward(ctx, x, gamma, beta, running_mean, running_var, eps, momentum, training, process_group):
        x = x.contiguous()
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.process_group = process_group

        if not ctx.training:
            _ex, _var = running_mean.contiguous(), running_var.contiguous()
            _exs = _var + _ex ** 2
            if x.is_cuda:
                y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
            else:
                y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
            ctx.save_for_backward(x, _ex, _exs, gamma, beta)
            return y

        size = x.numel() // x.size(1)
        if size == 1:
            raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))

        if x.is_cuda:
            _ex, _exs = lib.gpu.expectation_forward(x)
        else:
            raise NotImplemented

        count = torch.Tensor([1]).to(x.device)
        count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
        _ex_all_reduce = torch.distributed.all_reduce(_ex, group=process_group, async_op=True)
        _exs_all_reduce = torch.distributed.all_reduce(_exs, group=process_group, async_op=True)

        count_all_reduce.wait()
        _ex_all_reduce.wait()
        _exs_all_reduce.wait()

        _ex = _ex / count
        _exs = _exs / count

        # Update running stats
        _var = _exs - _ex ** 2
        running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * _ex)
        running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * _var)

        # Mark in-place modified tensors
        ctx.mark_dirty(running_mean, running_var)

        # BN forward + activation
        if x.is_cuda:
            y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
        else:
            y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)

        ctx.save_for_backward(x, _ex, _exs, gamma, beta)
        return y

    @staticmethod
    def backward(ctx, dz):
        x, _ex, _exs, gamma, beta = ctx.saved_tensors
        dz = dz.contiguous()

        # BN backward
        if dz.is_cuda:
            dx, _dex, _dexs, dgamma, dbeta = \
                lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
        else:
            raise NotImplemented

        if ctx.training:
            process_group = ctx.process_group
            count = torch.Tensor([1]).to(x.device)
            count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
            _dex_all_reduce = torch.distributed.all_reduce(_dex, group=process_group, async_op=True)
            _dexs_all_reduce = torch.distributed.all_reduce(_dexs, group=process_group, async_op=True)

            count_all_reduce.wait()
            _dex_all_reduce.wait()
            _dexs_all_reduce.wait()

            _dex = _dex / count
            _dexs = _dexs / count

            if x.is_cuda:
                dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
            else:
                raise NotImplemented
            dx = dx + dx_

        return dx, dgamma, dbeta, None, None, None, None, None, None

dist_syncbatchnorm = dist_syncbatchnorm_.apply
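
dist_syncbatchnorm is the raw autograd function; in practice it sits behind an nn.Module that owns the affine parameters and running statistics. The wrapper below is only an illustrative sketch (the class name and the non-distributed fallback are assumptions, not the module added elsewhere in this commit).

# Sketch: a module wrapper around dist_syncbatchnorm (illustrative, not the commit's own layer).
import torch
import torch.nn as nn
from encoding.functions import dist_syncbatchnorm

class DistSyncBN2d(nn.BatchNorm2d):
    def __init__(self, num_features, process_group=None, **kwargs):
        super().__init__(num_features, **kwargs)
        self.process_group = process_group  # None falls back to the default group

    def forward(self, x):
        if not (self.training and torch.distributed.is_available()
                and torch.distributed.is_initialized()):
            return super().forward(x)  # plain BatchNorm outside distributed training
        return dist_syncbatchnorm(
            x, self.weight, self.bias, self.running_mean, self.running_var,
            self.eps, self.momentum, self.training, self.process_group)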
encoding/functions/rectify.py (new file, 0 → 100644)

##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Rectify function"""
import torch
from torch.autograd import Function

from .. import lib

__all__ = ['rectify']

class _rectify(Function):
    @staticmethod
    def forward(ctx, y, x, kernel_size, stride, padding, dilation, average):
        ctx.save_for_backward(x)
        # assuming kernel_size is 3
        kernel_size = [k + 2 * (d - 1) for k, d in zip(kernel_size, dilation)]
        ctx.kernel_size = kernel_size
        ctx.stride = stride
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.average = average
        if x.is_cuda:
            lib.gpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
        else:
            lib.cpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
        ctx.mark_dirty(y)
        return y

    @staticmethod
    def backward(ctx, grad_y):
        x, = ctx.saved_variables
        if x.is_cuda:
            lib.gpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
                                 ctx.padding, ctx.dilation, ctx.average)
        else:
            lib.cpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
                                 ctx.padding, ctx.dilation, ctx.average)
        ctx.mark_dirty(grad_y)
        return grad_y, None, None, None, None, None, None

rectify = _rectify.apply
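
rectify rescales, in place, the output of a padded (possibly dilated) convolution so that positions whose window hangs over the zero-padded border are re-weighted by their true coverage. A hedged sketch of a call site follows; the shapes, the depthwise averaging kernel, and the CUDA device are illustrative choices, not taken from this commit.

# Sketch: rectify an average-style depthwise convolution near the borders.
import torch
import torch.nn.functional as F
from encoding.functions import rectify

x = torch.randn(2, 8, 32, 32, device='cuda')
w = torch.ones(8, 1, 3, 3, device='cuda') / 9.0              # depthwise mean filter
y = F.conv2d(x, w, stride=1, padding=1, groups=8)
y = rectify(y, x, (3, 3), (1, 1), (1, 1), (1, 1), True)      # in-place rescale of y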
encoding/functions/syncbn.py (modified)

@@ -10,7 +10,7 @@
 """Synchronized Cross-GPU Batch Normalization functions"""
 import torch
 import torch.cuda.comm as comm
-from torch.autograd import Variable, Function
+from torch.autograd import Function
 from torch.autograd.function import once_differentiable
 from .. import lib
 ...
encoding/lib/__init__.py (modified)

@@ -12,6 +12,7 @@ cpu = load('enclib_cpu', [
     os.path.join(cpu_path, 'syncbn_cpu.cpp'),
     os.path.join(cpu_path, 'roi_align_cpu.cpp'),
     os.path.join(cpu_path, 'nms_cpu.cpp'),
+    os.path.join(cpu_path, 'rectify_cpu.cpp'),
     ], build_directory=cpu_path, verbose=False)

 if torch.cuda.is_available():
@@ -19,9 +20,9 @@ if torch.cuda.is_available():
     os.path.join(gpu_path, 'operator.cpp'),
     os.path.join(gpu_path, 'activation_kernel.cu'),
     os.path.join(gpu_path, 'encoding_kernel.cu'),
-    os.path.join(gpu_path, 'encodingv2_kernel.cu'),
     os.path.join(gpu_path, 'syncbn_kernel.cu'),
     os.path.join(gpu_path, 'roi_align_kernel.cu'),
     os.path.join(gpu_path, 'nms_kernel.cu'),
+    os.path.join(gpu_path, 'rectify_cuda.cu'),
     ], extra_cuda_cflags=["--expt-extended-lambda"],
     build_directory=gpu_path, verbose=False)
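
For context, these source lists are consumed by torch.utils.cpp_extension.load. Below is a reduced sketch of what the build call looks like after this change; the GPU extension name and the elided sources are assumptions based only on the visible hunks.

# Reduced sketch of the JIT build in encoding/lib/__init__.py after this commit.
import os
import torch
from torch.utils.cpp_extension import load

cpu_path = os.path.join(os.path.dirname(__file__), 'cpu')
gpu_path = os.path.join(os.path.dirname(__file__), 'gpu')

cpu = load('enclib_cpu', [
    os.path.join(cpu_path, 'syncbn_cpu.cpp'),
    os.path.join(cpu_path, 'roi_align_cpu.cpp'),
    os.path.join(cpu_path, 'nms_cpu.cpp'),
    os.path.join(cpu_path, 'rectify_cpu.cpp'),       # new in this commit
    # ... remaining CPU sources ...
    ], build_directory=cpu_path, verbose=False)

if torch.cuda.is_available():
    gpu = load('enclib_gpu', [                       # extension name assumed
        os.path.join(gpu_path, 'operator.cpp'),
        os.path.join(gpu_path, 'rectify_cuda.cu'),   # new in this commit
        # ... remaining GPU sources ...
        ], extra_cuda_cflags=["--expt-extended-lambda"],
        build_directory=gpu_path, verbose=False)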
encoding/lib/cpu/operator.cpp (modified)

@@ -12,4 +12,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
   m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
   m.def("non_max_suppression", &Non_Max_Suppression_CPU, "NMS (CPU)");
+  m.def("conv_rectify", &CONV_RECTIFY_CPU, "Convolution Rectifier (CPU)");
 }
encoding/lib/cpu/operator.h (modified)

@@ -72,3 +72,12 @@ std::vector<at::Tensor> Non_Max_Suppression_CPU(
     const at::Tensor& input,
     const at::Tensor& scores,
     double thresh);
+
+void CONV_RECTIFY_CPU(
+  at::Tensor& output,
+  const at::Tensor& input,
+  at::IntArrayRef kernel_size,
+  at::IntArrayRef stride,
+  at::IntArrayRef padding,
+  at::IntArrayRef dilation,
+  bool avg_mode);
encoding/lib/cpu/rectify_cpu.cpp (new file, 0 → 100644)

#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <tuple>
#include <torch/extension.h>
#include <ATen/div_rtn.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>

template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
  TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
              "integer out of range");
  return static_cast<dest_t>(v);
}

template<typename T>
static inline T pooling_output_shape_pad_lr(
    T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation, bool ceil_mode) {
  T outputSize = div_rtn<T>(
      inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
      (ceil_mode ? stride - 1 : 0), stride) + 1;
  if (pad_l) {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((outputSize - 1) * stride >= inputSize + pad_l)
      --outputSize;
  }
  return outputSize;
}

template<typename T>
static inline T pooling_output_shape(
    T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
  return pooling_output_shape_pad_lr(
      inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}

static inline void pool2d_shape_check(
  const at::Tensor& input,
  int kH, int kW, int dH, int dW, int padH, int padW,
  int dilationH, int dilationW,
  int64_t nInputPlane,
  int64_t inputHeight, int64_t inputWidth,
  int64_t outputHeight, int64_t outputWidth)
{
  const int64_t ndim = input.ndimension();
  const int64_t nOutputPlane = nInputPlane;

  TORCH_CHECK(kW > 0 && kH > 0,
              "kernel size should be greater than zero, but got ",
              "kH: ", kH, " kW: ", kW);
  TORCH_CHECK(dW > 0 && dH > 0,
              "stride should be greater than zero, but got "
              "dH: ", dH, " dW: ", dW);
  TORCH_CHECK(dilationH > 0 && dilationW > 0,
              "dilation should be greater than zero, but got ",
              "dilationH: ", dilationH, " dilationW: ", dilationW);

  TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
              "non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
  //TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
  //            "pad should be smaller than half of kernel size, but got ",
  //            "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);

  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
              "Given input size: (",
              nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
              "Calculated output size: (",
              nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
              "Output size is too small");
}

template <typename scalar_t>
static void conv_rectify_cpu_frame(
    scalar_t *output_data,
    int64_t nbatch,
    int64_t nInputPlane,
    int64_t inputWidth,
    int64_t inputHeight,
    int64_t outputWidth,
    int64_t outputHeight,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    const int dilation_h,
    const int dilation_w,
    bool average_mode)
{
  //at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
  for (int64_t k = 0; k < nInputPlane; k++)
  {
    int64_t p;
    for (p = 0; p < nbatch; p++)
    {
      int64_t xx, yy;
      /* For all output pixels... */
      scalar_t *ptr_output = output_data
          + p * nInputPlane * outputWidth * outputHeight
          + k * outputWidth * outputHeight;
      //int64_t i;
      for (yy = 0; yy < outputHeight; yy++)
      {
        for (xx = 0; xx < outputWidth; xx++)
        {
          /* Compute the mean of the input image... */
          int64_t hstart = yy * dH - padH;
          int64_t wstart = xx * dW - padW;
          int64_t hend = std::min(hstart + kH, inputHeight + padH);
          int64_t wend = std::min(wstart + kW, inputWidth + padW);
          //int pool_size = (hend - hstart) * (wend - wstart);
          int pool_size = ((kH - 1) / dilation_h + 1) * ((kW - 1) / dilation_w + 1);
          hstart = std::max(hstart, (int64_t) 0);
          wstart = std::max(wstart, (int64_t) 0);
          hend = std::min(hend, inputHeight);
          wend = std::min(wend, inputWidth);

          int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
          int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
          scalar_t mul_factor;
          if (average_mode) {
            mul_factor = scalar_t(1.0) / (hcount * wcount);
          }
          else {
            mul_factor = scalar_t(1.0) * pool_size / (hcount * wcount);
          }
          *ptr_output++ *= mul_factor;
        }
      }
    }
  }
  //});
}

void conv_rectify_cpu_tempalte(
  at::Tensor& output,
  const at::Tensor& input_,
  at::IntArrayRef kernel_size,
  at::IntArrayRef stride,
  at::IntArrayRef padding,
  at::IntArrayRef dilation,
  bool average_mode)
{
  // #20866, #22032: Guarantee this for the official C++ API?
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
    "conv_rectify: kernel_size must either be a single int, or a tuple of two ints");
  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
    "conv_rectify: stride must either be omitted, a single int, or a tuple of two ints");
  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
  const int dW = stride.empty() ? kW :
                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
    "conv_rectify: padding must either be a single int, or a tuple of two ints");
  const int padH = safe_downcast<int, int64_t>(padding[0]);
  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
    "rectify: dilation must either be a single int, or a tuple of two ints");
  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);

  TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
    "non-empty 2D or 3D (batch mode) tensor expected for input");

  /* sizes */
  const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
  const int64_t nInputPlane = input_.size(-3);
  const int64_t inputHeight = input_.size(-2);
  const int64_t inputWidth = input_.size(-1);

  //const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
  //const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
  const int64_t outputHeight = output.size(-2);
  const int64_t outputWidth = output.size(-1);

  pool2d_shape_check(
    input_,
    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
    nInputPlane,
    inputHeight, inputWidth,
    outputHeight, outputWidth);

  TORCH_CHECK(output.is_contiguous(), "conv_rectify: output must be contiguous");
  at::Tensor input = input_.contiguous();

  AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
      scalar_t *output_data = output.data_ptr<scalar_t>();
      conv_rectify_cpu_frame<scalar_t>(
        output_data,
        nbatch,
        nInputPlane,
        inputWidth, inputHeight,
        outputWidth, outputHeight,
        kW, kH, dW, dH, padW, padH,
        dilationH, dilationW,
        average_mode);
    }
  ));
}

void CONV_RECTIFY_CPU(
  at::Tensor& output,
  const at::Tensor& input,
  at::IntArrayRef kernel_size,
  at::IntArrayRef stride,
  at::IntArrayRef padding,
  at::IntArrayRef dilation,
  bool average)
{
  //at::Tensor output = at::empty({0}, input.options());
  conv_rectify_cpu_tempalte(output, input, kernel_size, stride, padding, dilation, average);
}
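
The rectification factor depends only on the window geometry, so it can be sanity-checked outside C++. The small Python sketch below mirrors the hcount/wcount arithmetic of conv_rectify_cpu_frame for dilation 1; the function name and the test sizes are ours, not part of the commit.

# Sketch: recompute the per-pixel rectification factor (dilation = 1).
def rectify_factor(yy, xx, H, W, kH=3, kW=3, dH=1, dW=1, padH=1, padW=1, average=False):
    hstart, wstart = yy * dH - padH, xx * dW - padW
    hend, wend = min(hstart + kH, H + padH), min(wstart + kW, W + padW)
    pool_size = kH * kW                              # dilation-adjusted size; here dilation = 1
    hstart, wstart = max(hstart, 0), max(wstart, 0)
    hend, wend = min(hend, H), min(wend, W)
    hcount, wcount = hend - hstart, wend - wstart    # rows/cols actually covered by the window
    return 1.0 / (hcount * wcount) if average else float(pool_size) / (hcount * wcount)

print(rectify_factor(0, 0, 32, 32))    # corner pixel: 9 / 4 = 2.25
print(rectify_factor(16, 16, 32, 32))  # interior pixel: 9 / 9 = 1.0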
encoding/lib/gpu/activation_kernel.cu (modified)

-#include <vector>
 #include <torch/extension.h>
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <vector>
 #include <cuda_runtime_api.h>
 #include <thrust/device_ptr.h>
 #include <thrust/transform.h>
 #include "common.h"

 namespace {
 ...
encoding/lib/gpu/encodingv2_kernel.cu (deleted file, 100644 → 0)

#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>

#include "common.h"
#include "device_tensor.h"

namespace {

template<typename DType, typename Acctype>
struct KD2Op {
  __device__ KD2Op(DeviceTensor<DType, 3> x,
                   DeviceTensor<DType, 2> c,
                   DeviceTensor<DType, 2> std) : X(x), C(c), STD(std) {}
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    DType r = (X[b][i][d] - C[k][d]) / STD[k][d];
    return ScalarConvert<DType, Acctype>::to(r * r);
  }
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
};

template<typename DType, typename Acctype>
__global__ void Encoding_Dist_Forward_kernel(
    DeviceTensor<DType, 3> KD,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD) {
  /* declarations of the variables */
  int b, k, i, D;
  /* Get the index and channels */
  b = blockIdx.z;
  k = blockIdx.x;
  i = blockIdx.y;
  D = X.getSize(2);
  /* main operation */
  KD2Op<DType, Acctype> g(X, C, STD);
  KD[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}

template<typename DType, typename Acctype>
struct EncGradXOp {
  __device__ EncGradXOp(DeviceTensor<DType, 3> gkd,
                        DeviceTensor<DType, 3> x,
                        DeviceTensor<DType, 2> c,
                        DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
  // DeviceTensor<DType, 1> s, S(s)
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    return ScalarConvert<DType, Acctype>::to(
        2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) / (STD[k][d] * STD[k][d]));
  }
  DeviceTensor<DType, 3> GKD;
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
  // DeviceTensor<DType, 1> S;
};

template<typename DType, typename Acctype>
__global__ void Encoding_GradX_kernel(
    DeviceTensor<DType, 3> GKD,
    DeviceTensor<DType, 3> GX,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD) {
  // DeviceTensor<DType, 1> S
  /* declarations of the variables */
  int b, d, i, K;
  /* Get the index and channels */
  b = blockIdx.z;
  i = blockIdx.y;
  d = blockIdx.x;
  K = C.getSize(0);
  /* main operation */
  EncGradXOp<DType, Acctype> g(GKD, X, C, STD);
  GX[b][i][d] = reduceK<Acctype>(g, b, i, d, K);
}

template<typename DType, typename Acctype>
struct EncGradSTDOp {
  __device__ EncGradSTDOp(DeviceTensor<DType, 3> gkd,
                          DeviceTensor<DType, 3> x,
                          DeviceTensor<DType, 2> c,
                          DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
  // DeviceTensor<DType, 1> s, S(s)
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    return ScalarConvert<DType, Acctype>::to(
        -2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) * (X[b][i][d] - C[k][d]) /
        (STD[k][d] * STD[k][d] * STD[k][d]));
  }
  DeviceTensor<DType, 3> GKD;
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
  // DeviceTensor<DType, 1> S;
};

template<typename DType, typename Acctype>
__global__ void Encoding_GradCSTD_kernel(
    DeviceTensor<DType, 3> GKD,
    DeviceTensor<DType, 2> GC,
    DeviceTensor<DType, 2> GSTD,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD) {
  /* declarations of the variables */
  int k, d, B, N;
  /* Get the index and channels */
  d = blockIdx.x;
  k = blockIdx.y;
  B = X.getSize(0);
  N = X.getSize(1);
  /* main operation */
  EncGradXOp<DType, Acctype> g1(GKD, X, C, STD);
  EncGradSTDOp<DType, Acctype> g2(GKD, X, C, STD);
  GC[k][d] = -reduceBN<Acctype>(g1, k, d, B, N);
  GSTD[k][d] += reduceBN<Acctype>(g2, k, d, B, N);
}

template<typename DType, typename Acctype>
struct EncGradSTDXOp {
  __device__ EncGradSTDXOp(DeviceTensor<DType, 2> gstd,
                           DeviceTensor<DType, 3> x,
                           DeviceTensor<DType, 2> c,
                           DeviceTensor<DType, 2> std) : GSTD(gstd), X(x), C(c), STD(std) {}
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    return ScalarConvert<DType, Acctype>::to(
        GSTD[k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
  }
  DeviceTensor<DType, 2> GSTD;
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
};

template<typename DType, typename Acctype>
__global__ void Encoding_GradSTDX_kernel(
    DeviceTensor<DType, 2> GSTD,
    DeviceTensor<DType, 3> GX,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD,
    int N) {
  /* declarations of the variables */
  int b, d, i, K;
  /* Get the index and channels */
  b = blockIdx.z;
  i = blockIdx.y;
  d = blockIdx.x;
  K = C.getSize(0);
  /* main operation */
  EncGradSTDXOp<DType, Acctype> g(GSTD, X, C, STD);
  GX[b][i][d] += reduceK<Acctype>(g, b, i, d, K) / N;
}

template<typename DType, typename Acctype>
struct AggOpV2 {
  __device__ AggOpV2(DeviceTensor<DType, 3> a,
                     DeviceTensor<DType, 3> x,
                     DeviceTensor<DType, 2> c,
                     DeviceTensor<DType, 2> std) : A(a), X(x), C(c), STD(std) {}
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    return ScalarConvert<DType, Acctype>::to(
        A[b][i][k] * (X[b][i][d] - C[k][d]) / STD[k][d]);
  }
  DeviceTensor<DType, 3> A;
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
};

template<typename DType, typename Acctype>
__global__ void AggregateV2_Forward_kernel(
    DeviceTensor<DType, 3> E,
    DeviceTensor<DType, 3> A,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD) {
  /* declarations of the variables */
  int b, k, d, N;
  /* Get the index and channels */
  b = blockIdx.z;
  d = blockIdx.x;
  k = blockIdx.y;
  N = X.getSize(1);
  /* main operation */
  AggOpV2<DType, Acctype> g(A, X, C, STD);
  E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}

template<typename DType, typename Acctype>
struct AggV2BackOp {
  __device__ AggV2BackOp(DeviceTensor<DType, 3> g,
                         DeviceTensor<DType, 3> x,
                         DeviceTensor<DType, 2> c,
                         DeviceTensor<DType, 2> std) : G(g), X(x), C(c), STD(std) {}
  __device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
    return ScalarConvert<DType, Acctype>::to(
        G[b][k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
  }
  DeviceTensor<DType, 3> G;
  DeviceTensor<DType, 3> X;
  DeviceTensor<DType, 2> C;
  DeviceTensor<DType, 2> STD;
};

template<typename DType, typename Acctype>
__global__ void AggregateV2_Backward_kernel(
    DeviceTensor<DType, 3> GA,
    DeviceTensor<DType, 3> GE,
    DeviceTensor<DType, 3> A,
    DeviceTensor<DType, 3> X,
    DeviceTensor<DType, 2> C,
    DeviceTensor<DType, 2> STD) {
  /* declarations of the variables */
  int b, k, i, D;
  /* Get the index and channels */
  b = blockIdx.z;
  i = blockIdx.y;
  k = blockIdx.x;
  D = GE.getSize(2);
  /* main operation */
  AggV2BackOp<DType, Acctype> g(GE, X, C, STD);
  GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}

} // namespace

at::Tensor Encoding_Dist_Inference_Forward_CUDA(
    const at::Tensor X_,
    const at::Tensor C_,
    const at::Tensor STD_) {
  // const at::Tensor S_,
  // X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
  auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
  // E(x), E(x^2)
  int N = X_.size(0) * X_.size(1);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
  dim3 threads(getNumThreads(C_.size(1)));
  // calculate the kernel distance
  AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Inference_Forward_CUDA", ([&] {
    /* Device tensors */
    DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    /* kernel function */
    Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
      <<<blocks, threads, 0, stream>>>(KD, X, C, STD);
  }));
  AT_ASSERT(cudaGetLastError() == cudaSuccess);
  return KD_;
}

std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
    const at::Tensor GKD_,
    const at::Tensor KD_,
    const at::Tensor X_,
    const at::Tensor C_,
    const at::Tensor STD_) {
  auto GX_ = at::zeros_like(X_);
  auto GC_ = at::zeros_like(C_);
  auto GSTD_ = at::zeros_like(STD_);
  /* kernel function */
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
  dim3 threads1(getNumThreads(C_.size(0)));
  dim3 blocks2(C_.size(1), C_.size(0));
  dim3 threads2(getNumThreads(X_.size(1)));
  int N = X_.size(0) * X_.size(1);
  AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
    /* Device tensors */
    DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
    DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD_);
    DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
    DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    Encoding_GradX_kernel<scalar_t, scalar_t>
      <<<blocks1, threads1, 0, stream>>>(GKD, GX, X, C, STD);
    AT_ASSERT(cudaGetLastError() == cudaSuccess);
    Encoding_GradCSTD_kernel<scalar_t, scalar_t>
      <<<blocks2, threads2, 0, stream>>>(GKD, GC, GSTD, X, C, STD);
    AT_ASSERT(cudaGetLastError() == cudaSuccess);
  }));
  return {GX_, GC_, GSTD_};
}

std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
    const at::Tensor X_,
    const at::Tensor C_,
    double eps) {
  // const at::Tensor S_,
  // X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
  auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
  // E(x), E(x^2)
  int N = X_.size(0) * X_.size(1);
  auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
                2 * C_ * X_.sum(0).sum(0).view({1, X_.size(2)})).expand_as(C_) +
               C_.pow(2) * N;
  auto STD_ = at::sqrt(SVar_ / N + eps);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
  dim3 threads(getNumThreads(C_.size(1)));
  // calculate the kernel distance
  AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Forward_CUDA", ([&] {
    /* Device tensors */
    DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    /* kernel function */
    Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
      <<<blocks, threads, 0, stream>>>(KD, X, C, STD);
  }));
  AT_ASSERT(cudaGetLastError() == cudaSuccess);
  return {KD_, STD_, SVar_ / (N - 1)};
}

std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
    const at::Tensor GKD_,
    const at::Tensor GSTD_,
    const at::Tensor KD_,
    const at::Tensor X_,
    const at::Tensor C_,
    const at::Tensor STD_) {
  auto GX_ = at::zeros_like(X_);
  auto GC_ = at::zeros_like(C_);
  auto GSTD2_ = GSTD_.clone();
  /* kernel function */
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
  dim3 threads1(getNumThreads(C_.size(0)));
  dim3 blocks2(C_.size(1), C_.size(0));
  dim3 threads2(getNumThreads(X_.size(1)));
  int N = X_.size(0) * X_.size(1);
  AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
    /* Device tensors */
    DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
    DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD2_);
    DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
    DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    Encoding_GradX_kernel<scalar_t, scalar_t>
      <<<blocks1, threads1, 0, stream>>>(GKD, GX, X, C, STD);
    AT_ASSERT(cudaGetLastError() == cudaSuccess);
    Encoding_GradCSTD_kernel<scalar_t, scalar_t>
      <<<blocks2, threads2, 0, stream>>>(GKD, GC, GSTD, X, C, STD);
    AT_ASSERT(cudaGetLastError() == cudaSuccess);
    Encoding_GradSTDX_kernel<scalar_t, scalar_t>
      <<<blocks1, threads1, 0, stream>>>(GSTD, GX, X, C, STD, N);
    AT_ASSERT(cudaGetLastError() == cudaSuccess);
  }));
  // d_sigma/d_c
  GC_ = GC_ - GSTD2_ * (X_.mean(0).mean(0) - C_) / STD_;
  return {GX_, GC_};
}

at::Tensor AggregateV2_Forward_CUDA(
    const at::Tensor A_,
    const at::Tensor X_,
    const at::Tensor C_,
    const at::Tensor STD_) {
  /* Device tensors */
  auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
  // auto IS_ = 1.0f / (S_ + eps).sqrt();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // B, K, D
  dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
  dim3 threads(getNumThreads(X_.size(1)));
  AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
    DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
    DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    /* kernel function */
    AggregateV2_Forward_kernel<scalar_t, scalar_t>
      <<<blocks, threads, 0, stream>>>(E, A, X, C, STD);
  }));
  AT_ASSERT(cudaGetLastError() == cudaSuccess);
  return E_;
}

std::vector<at::Tensor> AggregateV2_Backward_CUDA(
    const at::Tensor GE_,
    const at::Tensor E_,
    const at::Tensor A_,
    const at::Tensor X_,
    const at::Tensor C_,
    const at::Tensor STD_) {
  auto gradA_ = at::zeros_like(A_);
  auto gradX_ = at::bmm(A_, (GE_ / STD_.unsqueeze(0)));
  auto gradC_ = -(A_.sum(1).unsqueeze(2) * GE_ / STD_.unsqueeze(0)).sum(0);
  auto gradSTD_ = -(GE_ * E_).sum(0) / STD_;
  // auto gradS_ = -0.5 * (GE_ * E_).sum(2).sum(0) / S_;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // B, K, D
  dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
  dim3 threads(getNumThreads(C_.size(1)));
  AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
    /* Device tensors */
    DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
    DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
    DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
    DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
    DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
    DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
    AggregateV2_Backward_kernel<scalar_t, scalar_t>
      <<<blocks, threads, 0, stream>>>(GA, GE, A, X, C, STD);
  }));
  AT_ASSERT(cudaGetLastError() == cudaSuccess);
  return {gradA_, gradX_, gradC_, gradSTD_};
}
encoding/lib/gpu/operator.cpp (modified)

@@ -16,14 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("expectation_backward", &Expectation_Backward_CUDA, "Expectation backward (CUDA)");
   m.def("expectation_inp_backward", &Expectation_Inp_Backward_CUDA,
         "Inplace Expectation backward (CUDA)");
-  m.def("encoding_dist_forward", &Encoding_Dist_Forward_CUDA, "EncDist forward (CUDA)");
-  m.def("encoding_dist_backward", &Encoding_Dist_Backward_CUDA, "Assign backward (CUDA)");
-  m.def("encoding_dist_inference_forward", &Encoding_Dist_Inference_Forward_CUDA,
-        "EncDist Inference forward (CUDA)");
-  m.def("encoding_dist_inference_backward", &Encoding_Dist_Inference_Backward_CUDA,
-        "Assign Inference backward (CUDA)");
-  m.def("aggregatev2_forward", &AggregateV2_Forward_CUDA, "AggregateV2 forward (CUDA)");
-  m.def("aggregatev2_backward", &AggregateV2_Backward_CUDA, "AggregateV2 backward (CUDA)");
   m.def("leaky_relu_forward", &LeakyRelu_Forward_CUDA, "Learky ReLU forward (CUDA)");
   m.def("leaky_relu_backward", &LeakyRelu_Backward_CUDA, "Learky ReLU backward (CUDA)");
+  m.def("conv_rectify", &CONV_RECTIFY_CUDA, "Convolution Rectifier (CUDA)");
 }
encoding/lib/gpu/operator.h (modified)

 #include <torch/extension.h>
+#include <ATen/ATen.h>
 #include <vector>

 at::Tensor ROIAlign_Forward_CUDA(
 ...
@@ -102,45 +103,15 @@ at::Tensor Expectation_Inp_Backward_CUDA(
     const at::Tensor beta_,
     float eps);

-at::Tensor Encoding_Dist_Inference_Forward_CUDA(
-    const at::Tensor X_,
-    const at::Tensor C_,
-    const at::Tensor STD_);
-
-std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
-    const at::Tensor GKD_,
-    const at::Tensor KD_,
-    const at::Tensor X_,
-    const at::Tensor C_,
-    const at::Tensor STD_);
-
-std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
-    const at::Tensor X,
-    const at::Tensor C,
-    double eps);
-
-std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
-    const at::Tensor GKD_,
-    const at::Tensor GSTD_,
-    const at::Tensor KD_,
-    const at::Tensor X_,
-    const at::Tensor C_,
-    const at::Tensor STD_);
-
-at::Tensor AggregateV2_Forward_CUDA(
-    const at::Tensor A_,
-    const at::Tensor X_,
-    const at::Tensor C_,
-    const at::Tensor STD_);
-
-std::vector<at::Tensor> AggregateV2_Backward_CUDA(
-    const at::Tensor GE_,
-    const at::Tensor E_,
-    const at::Tensor A_,
-    const at::Tensor X_,
-    const at::Tensor C_,
-    const at::Tensor STD_);
-
 void LeakyRelu_Forward_CUDA(at::Tensor z, float slope);

 void LeakyRelu_Backward_CUDA(at::Tensor z, at::Tensor dz, float slope);
+
+void CONV_RECTIFY_CUDA(
+  at::Tensor& output,
+  const at::Tensor& input,
+  at::IntArrayRef kernel_size,
+  at::IntArrayRef stride,
+  at::IntArrayRef padding,
+  at::IntArrayRef dilation,
+  bool avg_mode);
encoding/lib/gpu/rectify_cuda.cu (new file, 0 → 100644)

#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/div_rtn.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/KernelUtils.h>

template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
  TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
              "integer out of range");
  return static_cast<dest_t>(v);
}

template<typename T>
static inline T pooling_output_shape_pad_lr(
    T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation, bool ceil_mode) {
  T outputSize = div_rtn<T>(
      inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
      (ceil_mode ? stride - 1 : 0), stride) + 1;
  if (pad_l) {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((outputSize - 1) * stride >= inputSize + pad_l)
      --outputSize;
  }
  return outputSize;
}

template<typename T>
static inline T pooling_output_shape(
    T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
  return pooling_output_shape_pad_lr(
      inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}

static inline void pool2d_shape_check(
  const at::Tensor& input,
  int kH, int kW, int dH, int dW, int padH, int padW,
  int dilationH, int dilationW,
  int64_t nInputPlane,
  int64_t inputHeight, int64_t inputWidth,
  int64_t outputHeight, int64_t outputWidth)
{
  const int64_t ndim = input.ndimension();
  const int64_t nOutputPlane = nInputPlane;

  TORCH_CHECK(kW > 0 && kH > 0,
              "kernel size should be greater than zero, but got ",
              "kH: ", kH, " kW: ", kW);
  TORCH_CHECK(dW > 0 && dH > 0,
              "stride should be greater than zero, but got "
              "dH: ", dH, " dW: ", dW);
  TORCH_CHECK(dilationH > 0 && dilationW > 0,
              "dilation should be greater than zero, but got ",
              "dilationH: ", dilationH, " dilationW: ", dilationW);

  TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
              "non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
  //TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
  //            "pad should be smaller than half of kernel size, but got ",
  //            "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);

  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
              "Given input size: (",
              nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
              "Calculated output size: (",
              nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
              "Output size is too small");
}

template <typename scalar_t, typename accscalar_t>
__global__ void conv_rectify_cuda_frame(
    const int nthreads,
    //const scalar_t* const bottom_data,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_h,
    const int pad_w,
    const int dilation_h,
    const int dilation_w,
    scalar_t* const top_data,
    bool average_mode)
{
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    //const int c = (index / pooled_width / pooled_height) % channels;
    //const int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_h;
    int wstart = pw * stride_w - pad_w;
    int hend = min(hstart + kernel_h, height + pad_h);
    int wend = min(wstart + kernel_w, width + pad_w);
    const int pool_size = ((kernel_h - 1) / dilation_h + 1) * ((kernel_w - 1) / dilation_w + 1);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    hend = min(hend, height);
    wend = min(wend, width);
    accscalar_t mul_factor;
    int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
    int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
    if (average_mode) {
      mul_factor = accscalar_t(1.0) / (hcount * wcount);
    }
    else {
      mul_factor = accscalar_t(1.0) * pool_size / (hcount * wcount);
    }
    top_data[index] = ScalarConvert<accscalar_t, scalar_t>::to(top_data[index] * mul_factor);
  }
}

void conv_rectify_cuda_tempalte(
  at::Tensor& output,
  const at::Tensor& input_,
  at::IntArrayRef kernel_size,
  at::IntArrayRef stride,
  at::IntArrayRef padding,
  at::IntArrayRef dilation,
  bool average)
{
  //at::TensorArg output_arg{ output, "output", 1 };
  //at::TensorArg input_arg{ input_, "input_", 2 };
  //checkAllSameGPU("rectify_out_cuda", {output_arg, input_arg});
  // #20866, #22032: Guarantee this for the official C++ API?
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
    "rectify: kernel_size must either be a single int, or a tuple of two ints");
  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
    "rectify: stride must either be omitted, a single int, or a tuple of two ints");
  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
  const int dW = stride.empty() ? kW :
                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
    "rectify: padding must either be a single int, or a tuple of two ints");
  const int padH = safe_downcast<int, int64_t>(padding[0]);
  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
    "rectify: dilation must either be a single int, or a tuple of two ints");
  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);

  TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
    "non-empty 3D or 4D (batch mode) tensor expected for input");

  const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
  const int64_t nInputPlane = input_.size(-3);
  const int64_t inputHeight = input_.size(-2);
  const int64_t inputWidth = input_.size(-1);

  //const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
  //const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
  const int64_t outputHeight = output.size(-2);
  const int64_t outputWidth = output.size(-1);

  pool2d_shape_check(
    input_,
    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
    nInputPlane,
    inputHeight, inputWidth,
    outputHeight, outputWidth);

  at::Tensor input = input_.contiguous();
  //output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});

  const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
  const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
  const uint32_t num_blocks = at::cuda::ATenCeilDiv<uint32_t>(count, num_threads);

  AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
      //using accscalar_t = acc_type<scalar_t, true>;
      scalar_t *output_data = output.data_ptr<scalar_t>();
      conv_rectify_cuda_frame<scalar_t, scalar_t>
        <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
          count,
          nbatch,
          nInputPlane,
          inputHeight, inputWidth,
          outputHeight, outputWidth,
          kH, kW, dH, dW, padH, padW,
          dilationH, dilationW,
          output_data,
          average);
    }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void CONV_RECTIFY_CUDA(
  at::Tensor& output,
  const at::Tensor& input,
  at::IntArrayRef kernel_size,
  at::IntArrayRef stride,
  at::IntArrayRef padding,
  at::IntArrayRef dilation,
  bool average)
{
  //at::Tensor output = at::empty({0}, input.options());
  conv_rectify_cuda_tempalte(output, input, kernel_size, stride, padding, dilation, average);
}
encoding/lib/gpu/setup.py (modified)

@@ -8,10 +8,10 @@ setup(
             'operator.cpp',
             'activation_kernel.cu',
             'encoding_kernel.cu',
-            'encodingv2_kernel.cu',
             'syncbn_kernel.cu',
             'roi_align_kernel.cu',
             'nms_kernel.cu',
+            'rectify.cu',
         ]),
     ],
     cmdclass={
 ...
encoding/lib/gpu/syncbn_kernel.cu (modified)

-#include <vector>
 #include <torch/extension.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <vector>
 #include "common.h"
 #include "device_tensor.h"
 ...
encoding/models/__init__.py (modified)

 from .model_zoo import get_model
 from .model_store import get_model_file
-from .resnet import *
-from .cifarresnet import *
-from .base import *
-from .fcn import *
-from .psp import *
-from .encnet import *
-from .deeplab import *
-def get_segmentation_model(name, **kwargs):
-    from .fcn import get_fcn
-    models = {
-        'fcn': get_fcn,
-        'psp': get_psp,
-        'atten': get_atten,
-        'encnet': get_encnet,
-        'encnetv2': get_encnetv2,
-        'deeplab': get_deeplab,
-    }
-    return models[name.lower()](**kwargs)
+from .sseg import get_segmentation_model, MultiEvalModule
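
Call sites keep the same entry point after this move; a minimal sketch follows, where the keyword arguments are illustrative rather than taken from this diff.

# Sketch: the factory is still reachable through encoding.models.
from encoding.models import get_segmentation_model

model = get_segmentation_model('encnet', dataset='ade20k', backbone='resnet50', aux=True)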