GitLab — OpenDAS / apex

Commit fa8e7d99 — "Clean up code"
Authored Apr 03, 2022 by Thor Johnsen
Parent: 05dd9c69

Showing 2 changed files with 1884 additions and 1773 deletions:

  apex/contrib/csrc/bottleneck/bottleneck.cpp          +1883  -1772
  apex/contrib/csrc/peer_memory/peer_memory_cuda.cu       +1     -1

apex/contrib/csrc/bottleneck/bottleneck.cpp @ fa8e7d99
@@ -102,13 +102,6 @@ enum {
     AFTERCONV_TENSOR,
     OPTIONAL,
     AFTEROPT_TENSOR,
-    AFTERACT_TENSOR,
-    GEN_INDEX_TENSOR,
-    MASK_TOP_TENSOR,
-    MASK_BOTTOM_TENSOR,
-    MASK_TENSOR,
-    THRESHOLD_TOP_TENSOR,
-    THRESHOLD_BOTTOM_TENSOR,
 };
 
 using common_conv_descriptors =
@@ -180,11 +173,11 @@ using common_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
 common_convbias_descriptors
 create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
                                      int64_t* padA,
                                      int64_t* convstrideA,
                                      int64_t* dilationA,
                                      int64_t* w_dim_padded,
                                      int64_t* y_dim_padded,
                                      cudnnDataType_t dataType) {
     const int convDim = 2;
     int64_t b_dim_padded[4];
@@ -279,183 +272,6 @@ create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
             .build());
 }
 
-using masked_convbias_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor>;
-
-masked_convbias_descriptors
-create_conv_bias_add_act_mask_descriptors(int64_t* x_dim_padded,
-                                          int64_t* padA,
-                                          int64_t* convstrideA,
-                                          int64_t* dilationA,
-                                          int64_t* w_dim_padded,
-                                          int64_t* y_dim_padded,
-                                          int64_t* threshold_dim,
-                                          cudnnDataType_t dataType) {
-    const int convDim = 2;
-    int64_t b_dim_padded[4];
-    b_dim_padded[0] = 1;
-    b_dim_padded[1] = y_dim_padded[1];
-    b_dim_padded[2] = 1;
-    b_dim_padded[3] = 1;
-    int64_t x_stride_padded[4];
-    int64_t y_stride_padded[4];
-    int64_t w_stride_padded[4];
-    int64_t b_stride_padded[4];
-    int64_t threshold_stride[4];
-    generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
-    return masked_convbias_descriptors(
-        cudnn_frontend::TensorBuilder().setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
-            .setId('x').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('y').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, w_dim_padded).setStrides(4, w_stride_padded)
-            .setId('w').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
-            .setId('z').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
-            .setId('b').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setVirtual().setId('A')  // after add
-            .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setVirtual().setId('B')  // after bias
-            .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('C')  // after conv
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('i').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('D')  // after optional add
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('E')  // after act for masked
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('I')  // output of the gen index operation
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_INT32).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('m')  // top half of the mask created after the less than
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('n')  // bottom half of the mask
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('M')  // OR of the top and bottom masks
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, threshold_dim).setStrides(4, threshold_stride)
-            .setId('t')  // threshold for creating the top mask
-            .setAlignment(16).setDataType(CUDNN_DATA_INT32).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, threshold_dim).setStrides(4, threshold_stride)
-            .setId('u')  // threshold for creating the bottom mask
-            .setAlignment(16).setDataType(CUDNN_DATA_INT32).build());
-}
-
 // tensor descriptors used for dgrad
 enum {
     X_OR_DX_TENSOR,
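[Editor's note, not part of the commit: every descriptor above is built through the same cudnn_frontend::TensorBuilder idiom — NCHW dims with NHWC strides computed by generateStrides(), a one-character UID, 16-byte alignment, and setVirtual() for intermediates that never touch memory. A minimal self-contained sketch of that idiom follows; the helper name make_nhwc_tensor is hypothetical.]

    #include <cudnn_frontend.h>

    // Hypothetical helper; generateStrides() plays the stride-computation
    // role in the real file. Dims are NCHW; strides describe NHWC layout.
    static cudnn_frontend::Tensor make_nhwc_tensor(int64_t dim[4], char id,
                                                   cudnnDataType_t type,
                                                   bool is_virtual) {
        int64_t stride[4];
        stride[1] = 1;                           // C is innermost in NHWC
        stride[3] = dim[1];                      // W steps over C
        stride[2] = dim[3] * dim[1];             // H steps over W*C
        stride[0] = dim[2] * dim[3] * dim[1];    // N steps over H*W*C
        if (is_virtual) {
            // Virtual tensors are graph intermediates with no backing memory.
            return cudnn_frontend::TensorBuilder().setDim(4, dim).setStrides(4, stride)
                .setId(id).setAlignment(16).setDataType(type).setVirtual().build();
        }
        return cudnn_frontend::TensorBuilder().setDim(4, dim).setStrides(4, stride)
            .setId(id).setAlignment(16).setDataType(type).build();
    }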
@@ -465,14 +281,6 @@ enum {
     RELU_TENSOR,
     AFTER_DCONV_TENSOR,
     AFTER_DRELU_TENSOR,
-    DGRAD_INPUT_TENSOR,
-    DGRAD_OPTIONAL_TENSOR,
-    DGRAD_GEN_INDEX_TENSOR,
-    DGRAD_MASK_TOP_TENSOR,
-    DGRAD_MASK_BOTTOM_TENSOR,
-    DGRAD_MASK_TENSOR,
-    DGRAD_THRESHOLD_TOP_TENSOR,
-    DGRAD_THRESHOLD_BOTTOM_TENSOR,
 };
 
 using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
@@ -481,8 +289,6 @@ using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
-                                     cudnn_frontend::Tensor,
-                                     cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor>;
 
 dconv_descriptors
@@ -561,194 +367,20 @@ create_dconv_descriptors(int64_t* x_dim_padded,
             .setId('B')  // after drelu
             .setAlignment(16)
             .setDataType(CUDNN_DATA_FLOAT)
-            .build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('i').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('D')  // after optional add
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build());
-}
-
-using dconv_mask_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor>;
-
-dconv_mask_descriptors
-create_dconv_mask_descriptors(int64_t* x_dim_padded,
-                              int64_t* padA,
-                              int64_t* convstrideA,
-                              int64_t* dilationA,
-                              int64_t* w_dim_padded,
-                              int64_t* y_dim_padded,
-                              int64_t* threshold_dim,
-                              cudnnDataType_t dataType) {
-    const int convDim = 2;
-    int64_t b_dim_padded[4];
-    b_dim_padded[0] = 1;
-    b_dim_padded[1] = x_dim_padded[1];
-    b_dim_padded[2] = 1;
-    b_dim_padded[3] = 1;
-    int64_t x_stride_padded[4];
-    int64_t y_stride_padded[4];
-    int64_t w_stride_padded[4];
-    int64_t b_stride_padded[4];
-    int64_t threshold_stride[4];
-    generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
-    generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
-    return dconv_mask_descriptors(
-        cudnn_frontend::TensorBuilder().setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
-            .setId('x').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('y').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, w_dim_padded).setStrides(4, w_stride_padded)
-            .setId('w').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
-            .setId('s').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
-            .setId('r').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
-            .setVirtual().setId('A')  // after dconv
-            .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
-            .setVirtual().setId('B')  // after drelu
-            .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('i').setAlignment(16).setDataType(dataType).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('D')  // after optional add
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('I')  // output of the gen index operation
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_INT32).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('m')  // top half of the mask created after the less than
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('n')  // bottom half of the mask
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
-            .setId('M')  // OR of the top and bottom masks
-            .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, threshold_dim).setStrides(4, threshold_stride)
-            .setId('t')  // threshold for creating the top mask
-            .setAlignment(16).setDataType(CUDNN_DATA_INT32).build(),
-        cudnn_frontend::TensorBuilder().setDim(4, threshold_dim).setStrides(4, threshold_stride)
-            .setId('u')  // threshold for creating the bottom mask
-            .setAlignment(16).setDataType(CUDNN_DATA_INT32).build());
-}
+            .build());
+}
 
 // create a cache for plan
 std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
 
 // TODO: better name
 std::string getConvFusionString(int64_t* x_dim_padded,
                                 int64_t* padA,
                                 int64_t* convstrideA,
                                 int64_t* dilationA,
                                 int64_t* w_dim_padded,
                                 cudnnDataType_t dataType,
                                 std::string fusion_string) {
     for (int i = 0; i < 4; i++) {
         fusion_string += 'X';
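[Editor's note, not part of the commit: plan_cache and getConvFusionString implement memoized plan construction — the fusion string encodes shapes, conv parameters, and the operation-graph tag, and keys previously built execution plans so repeated launches skip engine selection. The file's getOrCreatePlan is not shown in this diff; the stand-in below is a hedged sketch that assumes an instant-mode heuristic pick of the first engine config is acceptable.]

    #include <string>
    #include <unordered_map>
    #include <cudnn_frontend.h>

    std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache_sketch;

    // Hypothetical stand-in for the file's getOrCreatePlan helper.
    cudnn_frontend::ExecutionPlan& getOrCreatePlanSketch(
            cudnnHandle_t handle,
            cudnn_frontend::OperationGraph& opGraph,
            const std::string& cache_string) {
        auto it = plan_cache_sketch.find(cache_string);
        if (it == plan_cache_sketch.end()) {
            // Cache miss: query instant-mode heuristics and build a plan
            // from the top-ranked engine config.
            auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
                                  .setOperationGraph(opGraph)
                                  .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
                                  .build();
            auto& cfg = heuristics.getEngineConfig(1)[0];
            it = plan_cache_sketch.emplace(cache_string,
                     cudnn_frontend::ExecutionPlanBuilder()
                         .setHandle(handle)
                         .setEngineConfig(cfg, opGraph.getTag())
                         .build()).first;
        }
        return it->second;   // cache hit: reuse the previously built plan
    }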
@@ -961,7 +593,7 @@ run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
         auto opGraph = cudnn_frontend::OperationGraphBuilder()
                            .setHandle(handle_)
-                           .setOperationGraph(devPtrI ? ops.size() : ops.size() - 1, ops.data())
+                           .setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
                            .build();
 
         // Create string encoding for plan caching
@@ -996,19 +628,18 @@ run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
 }
 
 void
-run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
-                                   int64_t* pad,
-                                   int64_t* convstride,
-                                   int64_t* dilation,
-                                   int64_t* w_dim_padded,
-                                   int64_t* y_dim_padded,
-                                   cudnnDataType_t dataType,
-                                   at::Half* devPtrX,
-                                   at::Half* devPtrW,
-                                   at::Half* devPtrY,
-                                   at::Half* devPtrZ,
-                                   at::Half* devPtrB,
-                                   at::Half* devPtrI) {
+run_conv_scale_bias(int64_t* x_dim_padded,
+                    int64_t* pad,
+                    int64_t* convstride,
+                    int64_t* dilation,
+                    int64_t* w_dim_padded,
+                    int64_t* y_dim_padded,
+                    cudnnDataType_t dataType,
+                    at::Half* devPtrX,
+                    at::Half* devPtrW,
+                    at::Half* devPtrY,
+                    at::Half* devPtrZ,
+                    at::Half* devPtrB) {
     cudnnHandle_t handle_ = torch::native::getCudnnHandle();
     std::stringstream log_buf;
     try {
@@ -1026,36 +657,21 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
         DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
         DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
         DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTEROPT_TENSOR>(tensors).describe());
 
         // Define the add operation
         auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                              .setMode(CUDNN_POINTWISE_MUL)
                              .setMathPrecision(CUDNN_DATA_FLOAT)
                              .build();
         DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
 
         // Define the bias operation
-        auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
-                            .setMode(CUDNN_POINTWISE_ADD)
-                            .setMathPrecision(CUDNN_DATA_FLOAT)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
-
-        // optional add
         auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_ADD)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
         DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
 
-        // Define the activation operation
-        auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                           .setMode(CUDNN_POINTWISE_RELU_FWD)
-                           .setMathPrecision(CUDNN_DATA_FLOAT)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
         // Define the convolution problem
         auto convDesc = cudnn_frontend::ConvDescBuilder()
                             .setDataType(CUDNN_DATA_FLOAT)
@@ -1082,43 +698,26 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
                            .build();
         DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-        // create an add node.
-        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(conv_op.getOutputTensor())
-                          .setbDesc(std::get<OPTIONAL>(tensors))
-                          .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
-                          .setpwDesc(addDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
         // Create a Add Node with scaling parameters.
         auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(add_op.getOutputTensor())
+                            .setxDesc(conv_op.getOutputTensor())
                             .setbDesc(std::get<Z_TENSOR>(tensors))
-                            .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
+                            .setyDesc(std::get<AFTERADD_TENSOR>(tensors))  // TODO: change enum to aftermul
                             .setpwDesc(scaleDesc)
                             .build();
         DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
 
         // Create a Bias Node.
-        auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(scale_op.getOutputTensor())
                           .setbDesc(std::get<B_TENSOR>(tensors))
-                          .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
-                          .setpwDesc(biasDesc)
+                          .setyDesc(std::get<Y_TENSOR>(tensors))
+                          .setpwDesc(addDesc)
                           .build();
-        DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
-
-        // Create an Activation Node.
-        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(bias_op.getOutputTensor())
-                          .setyDesc(std::get<Y_TENSOR>(tensors))
-                          .setpwDesc(actDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, act_op.describe());
+        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
 
         // Create an Operation Graph. In this case it is convolution add bias activation
-        std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &add_op, &scale_op, &bias_op, &act_op};
+        std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};
         auto opGraph = cudnn_frontend::OperationGraphBuilder()
                            .setHandle(handle_)
@@ -1140,12 +739,12 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
         if (workspace_size > 0) {
             workspace_ptr = workspace_tensor.data_ptr<float>();
         }
-        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
-        int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
+        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
+        int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
         auto variantPack = cudnn_frontend::VariantPackBuilder()
                                .setWorkspacePointer(workspace_ptr)
-                               .setDataPointers(6, data_ptrs)
-                               .setUids(6, uids)
+                               .setDataPointers(5, data_ptrs)
+                               .setUids(5, uids)
                                .build();
         DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
         cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
...
@@ -1156,76 +755,35 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
}
}
}
}
void
run_conv_scale_bias_add_activation_mask
(
int64_t
*
x_dim_padded
,
int64_t
*
pad
,
int64_t
*
convstride
,
int64_t
*
dilation
,
int64_t
*
w_dim_padded
,
int64_t
*
y_dim_padded
,
int64_t
*
threshold_dim
,
cudnnDataType_t
dataType
,
at
::
Half
*
devPtrX
,
at
::
Half
*
devPtrW
,
at
::
Half
*
devPtrY
,
at
::
Half
*
devPtrZ
,
at
::
Half
*
devPtrB
,
at
::
Half
*
devPtrI
,
int
*
devPtrT
,
int
*
devPtrU
,
int
axis
)
{
cudnnHandle_t
handle_
=
torch
::
native
::
getCudnnHandle
();
std
::
stringstream
log_buf
;
try
{
int
convDim
=
2
;
// Creates the necessary tensor descriptors
void
masked_convbias_descriptors
tensors
=
create_conv_bias_add_act_mask_descriptors
(
run_dconv_drelu_dscale
(
int64_t
*
x_dim_padded
,
x_dim_padded
,
pad
,
convstride
,
dilation
,
w_dim_padded
,
y_dim_padded
,
threshold_dim
,
dataType
);
int64_t
*
pad
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
X_TENSOR
>
(
tensors
).
describe
());
int64_t
*
convstride
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
Y_TENSOR
>
(
tensors
).
describe
());
int64_t
*
dilation
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
W_TENSOR
>
(
tensors
).
describe
());
int64_t
*
w_dim_padded
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
Z_TENSOR
>
(
tensors
).
describe
());
int64_t
*
y_dim_padded
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
B_TENSOR
>
(
tensors
).
describe
());
cudnnDataType_t
dataType
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERADD_TENSOR
>
(
tensors
).
describe
());
at
::
Half
*
devPtrX
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERBIAS_TENSOR
>
(
tensors
).
describe
());
at
::
Half
*
devPtrW
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERCONV_TENSOR
>
(
tensors
).
describe
());
at
::
Half
*
devPtrY
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
OPTIONAL
>
(
tensors
).
describe
());
at
::
Half
*
devPtrZ
,
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERACT_TENSOR
>
(
tensors
).
describe
());
at
::
Half
*
devPtrR
)
{
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
GEN_INDEX_TENSOR
>
(
tensors
).
describe
());
cudnnHandle_t
handle_
=
torch
::
native
::
getCudnnHandle
();
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_TOP_TENSOR
>
(
tensors
).
describe
());
std
::
stringstream
log_buf
;
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_BOTTOM_TENSOR
>
(
tensors
).
describe
());
try
{
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_TENSOR
>
(
tensors
).
describe
());
int
convDim
=
2
;
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
THRESHOLD_TOP_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
THRESHOLD_BOTTOM_TENSOR
>
(
tensors
).
describe
());
// Define the add operation
auto
scaleDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_MUL
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
scaleDesc
.
describe
());
// Define the bias operation
auto
biasDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_ADD
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
biasDesc
.
describe
());
// optional add
auto
addDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_ADD
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
addDesc
.
describe
());
// Define the activation operation
// Creates the necessary tensor descriptors
auto
actDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
dconv_descriptors
tensors
=
create_dconv_descriptors
(
.
setMode
(
CUDNN_POINTWISE_RELU_FWD
)
x_dim_padded
,
pad
,
convstride
,
dilation
,
w_dim_padded
,
y_dim_padded
,
dataType
);
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
X_OR_DX_TENSOR
>
(
tensors
).
describe
());
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DY_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
actDesc
.
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
W_OR_DW_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
SCALE_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
RELU_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTER_DCONV_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTER_DRELU_TENSOR
>
(
tensors
).
describe
());
// Define the convolution problem
// Define the convolution problem
auto
convDesc
=
cudnn_frontend
::
ConvDescBuilder
()
auto
convDesc
=
cudnn_frontend
::
ConvDescBuilder
()
...
@@ -1239,258 +797,119 @@ run_conv_scale_bias_add_activation_mask(int64_t* x_dim_padded,
                             .build();
         DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
 
-        // Define the genIndex descriptor
-        auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
-                                .setMode(CUDNN_POINTWISE_GEN_INDEX)
-                                .setMathPrecision(CUDNN_DATA_FLOAT)
-                                .setAxis(axis)
-                                .build();
-        DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());
-
-        // Define the lessThan descriptor
-        auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
-                                .setMode(CUDNN_POINTWISE_CMP_LT)
-                                .setMathPrecision(CUDNN_DATA_FLOAT)
-                                .build();
-        DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());
-
-        // Define the greaterThan descriptor
-        auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
-                                   .setMode(CUDNN_POINTWISE_CMP_GT)
-                                   .setMathPrecision(CUDNN_DATA_FLOAT)
-                                   .build();
-        DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());
-
-        // Define the logical_or descriptor
-        auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
-                                 .setMode(CUDNN_POINTWISE_LOGICAL_OR)
-                                 .setMathPrecision(CUDNN_DATA_BOOLEAN)
-                                 .build();
-        DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());
-
-        // Define the binary_selection descriptor
-        auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
-                                 .setMode(CUDNN_POINTWISE_BINARY_SELECT)
-                                 .setMathPrecision(CUDNN_DATA_FLOAT)
-                                 .build();
-        DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());
+        // Define the activation backward operation
+        auto actDesc = cudnn_frontend::PointWiseDescBuilder()
+                           .setMode(CUDNN_POINTWISE_RELU_BWD)
+                           .setMathPrecision(CUDNN_DATA_FLOAT)
+                           .build();
+        DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
+
+        // Define the scale backward operation
+        auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
+                             .setMode(CUDNN_POINTWISE_MUL)
+                             .setMathPrecision(CUDNN_DATA_FLOAT)
+                             .build();
+        DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
 
         float alpha = 1.0f;
         float beta = 0.0f;
 
         // Create a convolution Node
-        auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
-                           .setxDesc(std::get<X_TENSOR>(tensors))
-                           .setwDesc(std::get<W_TENSOR>(tensors))
-                           .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
-                           .setcDesc(convDesc)
-                           .setAlpha(alpha)
-                           .setBeta(beta)
-                           .build();
+        auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
+                           .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                           .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                           .setdyDesc(std::get<DY_TENSOR>(tensors))
+                           .setcDesc(convDesc)
+                           .setAlpha(alpha)
+                           .setBeta(beta)
+                           .build();
         DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-        // Create a Add Node with scaling parameters.
-        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(conv_op.getOutputTensor())
-                            .setbDesc(std::get<Z_TENSOR>(tensors))
-                            .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
-                            .setpwDesc(scaleDesc)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
-
-        // Create a Bias Node.
-        auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                           .setxDesc(scale_op.getOutputTensor())
-                           .setbDesc(std::get<B_TENSOR>(tensors))
-                           .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
-                           .setpwDesc(biasDesc)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
-
-        // Create a optional add Node.
-        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(bias_op.getOutputTensor())
-                          .setbDesc(std::get<OPTIONAL>(tensors))
-                          .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
-                          .setpwDesc(addDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
-        // Create an Activation Node.
-        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
-                          .setyDesc(std::get<AFTERACT_TENSOR>(tensors))
-                          .setpwDesc(actDesc)
-                          .build();
+        // TODO: do we need getOutputTensor(), and what it returns in backward case?
+        // Create an relu backward Node.
+        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                          .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                          .setxDesc(std::get<RELU_TENSOR>(tensors))
+                          .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+                          .setpwDesc(actDesc)
+                          .build();
         DEBUG_CUDNN_MSG(log_buf, act_op.describe());
 
-        // Create a Gen_Index Node.
-        auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                               .setxDesc(std::get<AFTERACT_TENSOR>(tensors))
-                               .setyDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                               .setpwDesc(genIndexDesc)
-                               .build();
-        DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());
-
-        // Create a LessThan Node.
-        auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                               .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                               .setbDesc(std::get<THRESHOLD_TOP_TENSOR>(tensors))
-                               .setyDesc(std::get<MASK_TOP_TENSOR>(tensors))
-                               .setpwDesc(lessThanDesc)
-                               .build();
-        DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());
-
-        // Create a GreaterThan Node.
-        auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                                  .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                                  .setbDesc(std::get<THRESHOLD_BOTTOM_TENSOR>(tensors))
-                                  .setyDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
-                                  .setpwDesc(greaterThanDesc)
-                                  .build();
-        DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());
-
-        // Create a LogicalOr Node.
-        auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                                .setxDesc(std::get<MASK_TOP_TENSOR>(tensors))
-                                .setbDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
-                                .setyDesc(std::get<MASK_TENSOR>(tensors))
-                                .setpwDesc(logicalOrDesc)
-                                .build();
-        DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());
-
-        // Create a Binary_Selection Node.
-        auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                                .setxDesc(std::get<AFTERCONV_TENSOR>(tensors))
-                                .setbDesc(std::get<AFTERACT_TENSOR>(tensors))
-                                .settDesc(std::get<MASK_TENSOR>(tensors))
-                                .setyDesc(std::get<Y_TENSOR>(tensors))
-                                .setpwDesc(selectionDesc)
-                                .build();
-        DEBUG_CUDNN_MSG(log_buf, selection_op.describe());
+        // Create a Scale Node.
+        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                            .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+                            .setbDesc(std::get<SCALE_TENSOR>(tensors))
+                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                            .setpwDesc(scaleDesc)
+                            .build();
+        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
 
         // Create an Operation Graph. In this case it is convolution add bias activation
-        if (devPtrI) {
-            std::array<cudnn_frontend::Operation const*, 10> ops = {&conv_op, &scale_op, &bias_op, &add_op, &act_op,
-                                                                    &genIndex_op, &lessThan_op, &greaterThan_op,
-                                                                    &logicalOr_op, &selection_op};
-            auto opGraph = cudnn_frontend::OperationGraphBuilder()
-                               .setHandle(handle_)
-                               .setOperationGraph(ops.size(), ops.data())
-                               .build();
-            // Create string encoding for plan caching
-            auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
-            DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
-            auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
-            DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
-            auto workspace_size = plan.getWorkspaceSize();
-            DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
-            void* workspace_ptr = nullptr;
-            auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
-            if (workspace_size > 0) {
-                workspace_ptr = workspace_tensor.data_ptr<float>();
-            }
-            void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI, devPtrT, devPtrU};
-            int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i', 't', 'u'};
-            auto variantPack = cudnn_frontend::VariantPackBuilder()
-                                   .setWorkspacePointer(workspace_ptr)
-                                   .setDataPointers(8, data_ptrs)
-                                   .setUids(8, uids)
-                                   .build();
-            DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
-            cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
-            checkCudnnErr(status);
-            cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
-        } else {
-            std::array<cudnn_frontend::Operation const*, 9> ops = {&conv_op, &scale_op, &bias_op, &act_op,
-                                                                   &genIndex_op, &lessThan_op, &greaterThan_op,
-                                                                   &logicalOr_op, &selection_op};
-            auto opGraph = cudnn_frontend::OperationGraphBuilder()
-                               .setHandle(handle_)
-                               .setOperationGraph(ops.size(), ops.data())
-                               .build();
-            // Create string encoding for plan caching
-            auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
-            DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
-            auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
-            DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
-            auto workspace_size = plan.getWorkspaceSize();
-            DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
-            void* workspace_ptr = nullptr;
-            auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
-            if (workspace_size > 0) {
-                workspace_ptr = workspace_tensor.data_ptr<float>();
-            }
-            void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrT, devPtrU};
-            int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 't', 'u'};
-            auto variantPack = cudnn_frontend::VariantPackBuilder()
-                                   .setWorkspacePointer(workspace_ptr)
-                                   .setDataPointers(7, data_ptrs)
-                                   .setUids(7, uids)
-                                   .build();
-            DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
-            cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
-            checkCudnnErr(status);
-            cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
-        }
+        std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};
+        auto opGraph = cudnn_frontend::OperationGraphBuilder()
+                           .setHandle(handle_)
+                           .setOperationGraph(ops.size(), ops.data())
+                           .build();
+        // Create string encoding for plan caching
+        auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+        DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+        auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+        DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+        auto workspace_size = plan.getWorkspaceSize();
+        DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+        void* workspace_ptr = nullptr;
+        auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+        if (workspace_size > 0) {
+            workspace_ptr = workspace_tensor.data_ptr<float>();
+        }
+        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
+        int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
+        auto variantPack = cudnn_frontend::VariantPackBuilder()
+                               .setWorkspacePointer(workspace_ptr)
+                               .setDataPointers(5, data_ptrs)
+                               .setUids(5, uids)
+                               .build();
+        DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+        cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+        checkCudnnErr(status);
+        cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
     } catch (cudnn_frontend::cudnnException e) {
         std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
     }
 }
 
 void
-run_conv_scale_bias(int64_t* x_dim_padded,
-                    int64_t* pad,
-                    int64_t* convstride,
-                    int64_t* dilation,
-                    int64_t* w_dim_padded,
-                    int64_t* y_dim_padded,
-                    cudnnDataType_t dataType,
-                    at::Half* devPtrX,
-                    at::Half* devPtrW,
-                    at::Half* devPtrY,
-                    at::Half* devPtrZ,
-                    at::Half* devPtrB) {
+run_dconv(int64_t* x_dim_padded,
+          int64_t* pad,
+          int64_t* convstride,
+          int64_t* dilation,
+          int64_t* w_dim_padded,
+          int64_t* y_dim_padded,
+          cudnnDataType_t dataType,
+          at::Half* devPtrX,
+          at::Half* devPtrW,
+          at::Half* devPtrY,
+          cudnnBackendDescriptorType_t mode) {
     cudnnHandle_t handle_ = torch::native::getCudnnHandle();
     std::stringstream log_buf;
     try {
         int convDim = 2;
         // Creates the necessary tensor descriptors
-        common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
-            x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
-        DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
-
-        // Define the add operation
-        auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                             .setMode(CUDNN_POINTWISE_MUL)
-                             .setMathPrecision(CUDNN_DATA_FLOAT)
-                             .build();
-        DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
-
-        // Define the bias operation
-        auto addDesc = cudnn_frontend::PointWiseDescBuilder()
-                           .setMode(CUDNN_POINTWISE_ADD)
-                           .setMathPrecision(CUDNN_DATA_FLOAT)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
+        dconv_descriptors tensors = create_dconv_descriptors(
+            x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+        DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+        DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
 
         // Define the convolution problem
         auto convDesc = cudnn_frontend::ConvDescBuilder()
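[Editor's note, not part of the commit: the deleted masked path assembled a halo mask from pointwise ops — GEN_INDEX emits each element's index along `axis`, CMP_LT/CMP_GT compare that index against the 't'/'u' thresholds, LOGICAL_OR merges the two half-masks, and BINARY_SELECT picks between the raw convolution output and the activated output. A scalar C++ rendering of that per-element rule follows; the select polarity is an assumption, since cuDNN's documentation defines BINARY_SELECT's exact semantics.]

    #include <cstdint>

    // Hypothetical scalar stand-in for the fused cuDNN graph that was removed.
    inline float masked_select(float after_conv, float after_act,
                               int64_t index_along_axis,
                               int32_t threshold_top, int32_t threshold_bottom) {
        bool top    = index_along_axis < threshold_top;     // CUDNN_POINTWISE_CMP_LT vs 't'
        bool bottom = index_along_axis > threshold_bottom;  // CUDNN_POINTWISE_CMP_GT vs 'u'
        bool halo   = top || bottom;                        // CUDNN_POINTWISE_LOGICAL_OR
        // Assumed polarity: halo elements keep the un-activated conv output.
        return halo ? after_conv : after_act;               // CUDNN_POINTWISE_BINARY_SELECT
    }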
@@ -1508,36 +927,31 @@ run_conv_scale_bias(int64_t* x_dim_padded,
         float beta = 0.0f;
 
         // Create a convolution Node
-        auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
-                           .setxDesc(std::get<X_TENSOR>(tensors))
-                           .setwDesc(std::get<W_TENSOR>(tensors))
-                           .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
-                           .setcDesc(convDesc)
-                           .setAlpha(alpha)
-                           .setBeta(beta)
-                           .build();
+        // mode should be one of following
+        // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
+        // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
+        auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
+        if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
+            conv_op_builder.setdxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                .setdyDesc(std::get<DY_TENSOR>(tensors))
+                .setcDesc(convDesc)
+                .setAlpha(alpha)
+                .setBeta(beta);
+        } else {
+            conv_op_builder.setxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                .setdwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                .setdyDesc(std::get<DY_TENSOR>(tensors))
+                .setcDesc(convDesc)
+                .setAlpha(alpha)
+                .setBeta(beta);
+        }
+        auto conv_op = conv_op_builder.build();
         DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-        // Create a Add Node with scaling parameters.
-        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(conv_op.getOutputTensor())
-                            .setbDesc(std::get<Z_TENSOR>(tensors))
-                            .setyDesc(std::get<AFTERADD_TENSOR>(tensors))  // TODO: change enum to aftermul
-                            .setpwDesc(scaleDesc)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
-
-        // Create a Bias Node.
-        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(scale_op.getOutputTensor())
-                          .setbDesc(std::get<B_TENSOR>(tensors))
-                          .setyDesc(std::get<Y_TENSOR>(tensors))
-                          .setpwDesc(addDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
         // Create an Operation Graph. In this case it is convolution add bias activation
-        std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};
+        std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};
         auto opGraph = cudnn_frontend::OperationGraphBuilder()
                            .setHandle(handle_)
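[Editor's note, not part of the commit: after this hunk a single run_dconv serves both backward-data and backward-filter, dispatched on the cudnnBackendDescriptorType_t argument. A call-site sketch, with hypothetical placeholder variables a caller would already hold:]

    // dims/pointers are hypothetical placeholders; signature order follows
    // run_dconv above: X pointer, then W, then Y (dy), then the mode.
    void backward_example(int64_t x_dim[4], int64_t pad[2], int64_t stride[2],
                          int64_t dilation[2], int64_t w_dim[4], int64_t y_dim[4],
                          at::Half* dx, at::Half* w, at::Half* dy) {
        // dgrad: 'x' is written (dx); 'w' and 'y' (dy) are read
        run_dconv(x_dim, pad, stride, dilation, w_dim, y_dim, CUDNN_DATA_HALF,
                  dx, w, dy,
                  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
        // wgrad: same entry point, different descriptor type; 'w' is written (dw)
        run_dconv(x_dim, pad, stride, dilation, w_dim, y_dim, CUDNN_DATA_HALF,
                  dx, w, dy,
                  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    }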
@@ -1559,13 +973,13 @@ run_conv_scale_bias(int64_t* x_dim_padded,
         if (workspace_size > 0) {
             workspace_ptr = workspace_tensor.data_ptr<float>();
         }
-        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
-        int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
+        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
+        int64_t uids[] = {'x', 'y', 'w'};
         auto variantPack = cudnn_frontend::VariantPackBuilder()
                                .setWorkspacePointer(workspace_ptr)
-                               .setDataPointers(5, data_ptrs)
-                               .setUids(5, uids)
+                               .setDataPointers(3, data_ptrs)
+                               .setUids(3, uids)
                                .build();
         DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
         cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
         checkCudnnErr(status);
@@ -1575,20 +989,18 @@ run_conv_scale_bias(int64_t* x_dim_padded,
 }
 
 void
-run_dconv_drelu_dscale(int64_t* x_dim_padded,
-                       int64_t* pad,
-                       int64_t* convstride,
-                       int64_t* dilation,
-                       int64_t* w_dim_padded,
-                       int64_t* y_dim_padded,
-                       cudnnDataType_t dataType,
-                       at::Half* devPtrX,
-                       at::Half* devPtrW,
-                       at::Half* devPtrY,
-                       at::Half* devPtrZ,
-                       at::Half* devPtrR) {
+run_dconv_add(int64_t* x_dim_padded,
+              int64_t* pad,
+              int64_t* convstride,
+              int64_t* dilation,
+              int64_t* w_dim_padded,
+              int64_t* y_dim_padded,
+              cudnnDataType_t dataType,
+              at::Half* devPtrX,
+              at::Half* devPtrW,
+              at::Half* devPtrY,
+              at::Half* devPtrR) {
     cudnnHandle_t handle_ = torch::native::getCudnnHandle();
     std::stringstream log_buf;
     try {
@@ -1617,19 +1029,12 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
                             .build();
         DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
 
-        // Define the activation backward operation
-        auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                           .setMode(CUDNN_POINTWISE_RELU_BWD)
-                           .setMathPrecision(CUDNN_DATA_FLOAT)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
-        // Define the scale backward operation
-        auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                             .setMode(CUDNN_POINTWISE_MUL)
-                             .setMathPrecision(CUDNN_DATA_FLOAT)
-                             .build();
-        DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
+        // Define the add backward operation
+        auto addDesc = cudnn_frontend::PointWiseDescBuilder()
+                           .setMode(CUDNN_POINTWISE_ADD)
+                           .setMathPrecision(CUDNN_DATA_FLOAT)
+                           .build();
+        DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
 
         float alpha = 1.0f;
         float beta = 0.0f;
@@ -1646,26 +1051,17 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
         DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
         // TODO: do we need getOutputTensor(), and what it returns in backward case?
-        // Create an relu backward Node.
-        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                          .setxDesc(std::get<RELU_TENSOR>(tensors))
-                          .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                          .setpwDesc(actDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, act_op.describe());
-
-        // Create a Scale Node.
-        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                            .setbDesc(std::get<SCALE_TENSOR>(tensors))
-                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
-                            .setpwDesc(scaleDesc)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+        // Create add Node.
+        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                          .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                          .setbDesc(std::get<RELU_TENSOR>(tensors))
+                          .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                          .setpwDesc(addDesc)
+                          .build();
+        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
 
         // Create an Operation Graph. In this case it is convolution add bias activation
-        std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};
+        std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &add_op};
         auto opGraph = cudnn_frontend::OperationGraphBuilder()
                            .setHandle(handle_)
@@ -1687,12 +1083,12 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
         if (workspace_size > 0) {
             workspace_ptr = workspace_tensor.data_ptr<float>();
         }
-        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
-        int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
+        void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
+        int64_t uids[] = {'x', 'y', 'w', 'r'};
         auto variantPack = cudnn_frontend::VariantPackBuilder()
                                .setWorkspacePointer(workspace_ptr)
-                               .setDataPointers(5, data_ptrs)
-                               .setUids(5, uids)
+                               .setDataPointers(4, data_ptrs)
+                               .setUids(4, uids)
                                .build();
         DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
         cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
@@ -1703,496 +1099,1467 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
 }
 
-void
-run_dconv_add_drelu_dscale(int64_t* x_dim_padded,
-                           int64_t* pad,
-                           int64_t* convstride,
-                           int64_t* dilation,
-                           int64_t* w_dim_padded,
-                           int64_t* y_dim_padded,
-                           cudnnDataType_t dataType,
-                           at::Half* devPtrX,
-                           at::Half* devPtrW,
-                           at::Half* devPtrY,
-                           at::Half* devPtrZ,
-                           at::Half* devPtrR,
-                           at::Half* devPtrI) {
-    cudnnHandle_t handle_ = torch::native::getCudnnHandle();
-    std::stringstream log_buf;
-    try {
-        int convDim = 2;
-        // Creates the necessary tensor descriptors
-        dconv_descriptors tensors = create_dconv_descriptors(
-            x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
-        DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_INPUT_TENSOR>(tensors).describe());
-        DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
-
-        // Define the convolution problem
-        auto convDesc = cudnn_frontend::ConvDescBuilder()
-                            .setDataType(CUDNN_DATA_FLOAT)
-                            .setMathMode(CUDNN_CROSS_CORRELATION)
-                            .setNDims(convDim)
-                            .setStrides(convDim, convstride)
-                            .setPrePadding(convDim, pad)
-                            .setPostPadding(convDim, pad)
-                            .setDilation(convDim, dilation)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
-
-        // optional add
-        auto addDesc = cudnn_frontend::PointWiseDescBuilder()
-                           .setMode(CUDNN_POINTWISE_ADD)
-                           .setMathPrecision(CUDNN_DATA_FLOAT)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
-
-        // Define the activation backward operation
-        auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                           .setMode(CUDNN_POINTWISE_RELU_BWD)
-                           .setMathPrecision(CUDNN_DATA_FLOAT)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
-        // Define the scale backward operation
-        auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                             .setMode(CUDNN_POINTWISE_MUL)
-                             .setMathPrecision(CUDNN_DATA_FLOAT)
-                             .build();
-        DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
-
-        float alpha = 1.0f;
-        float beta = 0.0f;
-
-        // Create a convolution Node
-        auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
-                           .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                           .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
-                           .setdyDesc(std::get<DY_TENSOR>(tensors))
-                           .setcDesc(convDesc)
-                           .setAlpha(alpha)
-                           .setBeta(beta)
-                           .build();
-        DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
-
-        // Create add Node.
-        auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                          .setbDesc(std::get<DGRAD_INPUT_TENSOR>(tensors))
-                          .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
-                          .setpwDesc(addDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
-        // TODO: do we need getOutputTensor(), and what it returns in backward case?
-        // Create an relu backward Node.
-        auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                          .setdyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
-                          .setxDesc(std::get<RELU_TENSOR>(tensors))
-                          .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                          .setpwDesc(actDesc)
-                          .build();
-        DEBUG_CUDNN_MSG(log_buf, act_op.describe());
-
-        // Create a Scale Node.
-        auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                            .setbDesc(std::get<SCALE_TENSOR>(tensors))
-                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
-                            .setpwDesc(scaleDesc)
-                            .build();
-        DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+// inputs contains x,w,z,b,(i)
+std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+    std::cout << std::fixed;
+    // create output vector
+    std::vector<at::Tensor> outputs;
+    auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+    // setup dimensions
+    int64_t dimA[] = {0, 0, 0, 0};
+    int64_t filterdimA1[] = {0, 0, 0, 0};
+    int64_t filterdimA2[] = {0, 0, 0, 0};
+    int64_t filterdimA3[] = {0, 0, 0, 0};
+    int64_t filterdimA4[] = {0, 0, 0, 0};
+
+    // All dim calculation after this order of n,c,h,w
+    int axis[] {0, 1, 2, 3};
+    if (explicit_nhwc) {
+        axis[0] = 0;
+        axis[1] = 3;
+        axis[2] = 1;
+        axis[3] = 2;
+    }
+    for (int dim = 0; dim < 4; dim++) {
+        dimA[dim] = inputs[0].size(axis[dim]);
+        filterdimA1[dim] = inputs[1].size(axis[dim]);
+        filterdimA2[dim] = inputs[2].size(axis[dim]);
+        filterdimA3[dim] = inputs[3].size(axis[dim]);
+    }
+    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+        for (int dim = 0; dim < 4; dim++) {
+            filterdimA4[dim] = inputs[10].size(axis[dim]);
+        }
+    }
+
+    // output dim in n,c,h,w used by backend
+    int64_t outdimA1[] = {0, 0, 0, 0};  // Computed Below
+    int64_t outdimA2[] = {0, 0, 0, 0};  // Computed Below
+    int64_t outdimA3[] = {0, 0, 0, 0};  // Computed Below
+
+    // use these fixed value for test run
+    int64_t padA[] = {0, 0};
+    int64_t padA1[] = {1, 1};
+    int64_t dilationA[] = {1, 1};
+    int64_t convstrideA[] = {1, 1};
+    int64_t convstride1X1[] = {stride_1X1, stride_1X1};
+
+    // compute output from pad/stride/dilation
+    outdimA1[0] = dimA[0];
+    outdimA1[1] = filterdimA1[0];
+    for (int dim = 0; dim < 2; dim++) {
+        outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+    }
+
+    outdimA2[0] = outdimA1[0];
+    outdimA2[1] = filterdimA2[0];
+    for (int dim = 0; dim < 2; dim++) {
+        outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+    }
+
+    outdimA3[0] = outdimA2[0];
+    outdimA3[1] = filterdimA3[0];
+    for (int dim = 0; dim < 2; dim++) {
+        outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+    }
+
+    // Create output tensor in the correct shape in pytorch's view
+    int64_t outdim1[] = {0, 0, 0, 0};
+    int64_t outdim2[] = {0, 0, 0, 0};
+    int64_t outdim3[] = {0, 0, 0, 0};
+    if (explicit_nhwc) {
+        axis[0] = 0;
+        axis[1] = 2;
+        axis[2] = 3;
+        axis[3] = 1;
+    }
+    for (int dim = 0; dim < 4; dim++) {
+        outdim1[dim] = outdimA1[axis[dim]];
+        outdim2[dim] = outdimA2[axis[dim]];
+        outdim3[dim
]
=
outdimA3
[
axis
[
dim
]];
}
// Create an Operation Graph. In this case it is convolution add bias activation
// run
std
::
array
<
cudnn_frontend
::
Operation
const
*
,
4
>
ops
=
{
&
conv_op
,
&
add_op
,
&
act_op
,
&
scale_op
};
at
::
Half
*
x
=
inputs
[
0
].
data_ptr
<
at
::
Half
>
();
at
::
Half
*
w
=
inputs
[
1
].
data_ptr
<
at
::
Half
>
();
at
::
Half
*
z
=
inputs
[
4
].
data_ptr
<
at
::
Half
>
();
at
::
Half
*
b
=
inputs
[
7
].
data_ptr
<
at
::
Half
>
();
auto
out1
=
at
::
empty
(
outdim1
,
inputs
[
0
].
type
(),
output_format
);
at
::
Half
*
y1
=
out1
.
data_ptr
<
at
::
Half
>
();
auto
opGraph
=
cudnn_frontend
::
OperationGraphBuilder
()
run_conv_scale_bias_add_activation
(
dimA
,
.
setHandle
(
handle_
)
padA
,
.
setOperationGraph
(
ops
.
size
(),
ops
.
data
())
convstride1X1
,
.
build
();
dilationA
,
filterdimA1
,
outdimA1
,
CUDNN_DATA_HALF
,
x
,
w
,
y1
,
z
,
b
,
nullptr
);
// Create string encoding for plan caching
DEBUG_MSG
(
"[DEBUG] new relu1 : "
<<
out1
.
to
(
at
::
kFloat
).
sum
().
item
<
float
>
());
auto
cache_string
=
getConvFusionString
(
x_dim_padded
,
pad
,
convstride
,
dilation
,
w_dim_padded
,
dataType
,
opGraph
.
getTag
());
DEBUG_CUDNN_MSG
(
log_buf
,
"[convstring] "
<<
cache_string
);
auto
&
plan
=
getOrCreatePlan
(
handle_
,
log_buf
,
opGraph
,
cache_string
);
w
=
inputs
[
2
].
data_ptr
<
at
::
Half
>
();
DEBUG_CUDNN_MSG
(
log_buf
,
"Plan tag: "
<<
plan
.
getTag
());
z
=
inputs
[
5
].
data_ptr
<
at
::
Half
>
();
b
=
inputs
[
8
].
data_ptr
<
at
::
Half
>
();
auto
out2
=
at
::
empty
(
outdim2
,
inputs
[
0
].
type
(),
output_format
);
at
::
Half
*
y2
=
out2
.
data_ptr
<
at
::
Half
>
();
auto
workspace_size
=
plan
.
getWorkspaceSize
();
run_conv_scale_bias_add_activation
(
outdimA1
,
DEBUG_CUDNN_MSG
(
log_buf
,
plan
.
describe
()
<<
" requires workspace "
<<
workspace_size
);
padA1
,
convstrideA
,
dilationA
,
filterdimA2
,
outdimA2
,
CUDNN_DATA_HALF
,
y1
,
w
,
y2
,
z
,
b
,
nullptr
);
DEBUG_MSG
(
"[DEBUG] new relu2 : "
<<
out2
.
to
(
at
::
kFloat
).
sum
().
item
<
float
>
());
void
*
workspace_ptr
=
nullptr
;
// create output of conv3
auto
workspace_tensor
=
at
::
empty
({(
workspace_size
+
3
)
/
4
},
at
::
TensorOptions
(
at
::
kCUDA
).
dtype
(
at
::
kFloat
));
auto
out3
=
at
::
empty
(
outdim3
,
inputs
[
0
].
type
(),
output_format
);
if
(
workspace_size
>
0
)
{
at
::
Half
*
y3
=
out3
.
data_ptr
<
at
::
Half
>
();
workspace_ptr
=
workspace_tensor
.
data_ptr
<
float
>
();
}
void
*
data_ptrs
[]
=
{
devPtrX
,
devPtrY
,
devPtrW
,
devPtrZ
,
devPtrR
,
devPtrI
};
int64_t
uids
[]
=
{
'x'
,
'y'
,
'w'
,
's'
,
'r'
,
'i'
};
auto
variantPack
=
cudnn_frontend
::
VariantPackBuilder
()
.
setWorkspacePointer
(
workspace_ptr
)
.
setDataPointers
(
6
,
data_ptrs
)
.
setUids
(
6
,
uids
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
"variantPack "
<<
variantPack
.
describe
());
cudnnStatus_t
status
=
cudnnBackendExecute
(
handle_
,
plan
.
get_raw_desc
(),
variantPack
.
get_raw_desc
());
checkCudnnErr
(
status
);
cudnn_frontend
::
throw_if
([
status
]()
{
return
(
status
!=
CUDNN_STATUS_SUCCESS
);
},
"Plan execute error"
,
status
);
}
catch
(
cudnn_frontend
::
cudnnException
e
)
{
std
::
cout
<<
log_buf
.
str
()
<<
"[ERROR] Exception "
<<
e
.
what
()
<<
std
::
endl
;
}
}
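
// Reading aid (not part of the cuDNN API): for incoming gradient dy, filter w,
// side-branch gradient i, per-channel scale s and saved forward ReLU input r,
// the four-op graph above computes
//   A  = dgrad(dy, w)        (virtual)
//   D  = A + i               (virtual)
//   B  = relu_bwd(D, r)      (virtual)
//   dx = B * s
// Only the non-virtual uids ('x','y','w','s','r','i') need device pointers in
// the variant pack; the virtual intermediates are never materialized.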
void run_dconv_drelu_dscale_mask(int64_t* x_dim_padded,
                                 int64_t* pad,
                                 int64_t* convstride,
                                 int64_t* dilation,
                                 int64_t* w_dim_padded,
                                 int64_t* y_dim_padded,
                                 int64_t* threshold_dim,
                                 cudnnDataType_t dataType,
                                 at::Half* devPtrX,
                                 at::Half* devPtrW,
                                 at::Half* devPtrY,
                                 at::Half* devPtrZ,
                                 at::Half* devPtrR,
                                 int* devPtrT,
                                 int* devPtrU,
                                 int axis) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_mask_descriptors tensors = create_dconv_mask_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_GEN_INDEX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the genIndex descriptor
    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .setAxis(axis)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());

    // Define the lessThan descriptor
    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_CMP_LT)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());

    // Define the greaterThan descriptor
    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
                               .setMode(CUDNN_POINTWISE_CMP_GT)
                               .setMathPrecision(CUDNN_DATA_FLOAT)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());

    // Define the logical_or descriptor
    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());

    // Define the binary_selection descriptor
    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());

    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());

    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());

    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());

    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                            .settDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 8> ops = {&conv_op, &act_op, &scale_op, &genIndex_op,
                                                           &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation,
                                            w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrT, devPtrU};
    int64_t uids[]    = {'x', 'y', 'w', 's', 'r', 't', 'u'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(7, data_ptrs)
                           .setUids(7, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
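
// Reading aid: this variant appends a gen-index/threshold mask to the fused
// dgrad. CUDNN_POINTWISE_GEN_INDEX emits each element's coordinate along
// `axis`; the two comparisons build boolean masks for coordinates below the
// 't' threshold and above the 'u' threshold, LOGICAL_OR combines them, and
// BINARY_SELECT chooses elementwise between the raw dgrad output ('A') and
// the scaled drelu result ('D') according to that mask. The thresholds are
// INT32, matching the index tensor 'I'. (Presumably this serves a
// spatially-split/halo path; the caller supplies the thresholds via
// devPtrT/devPtrU.)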
void run_dconv(int64_t* x_dim_padded,
               int64_t* pad,
               int64_t* convstride,
               int64_t* dilation,
               int64_t* w_dim_padded,
               int64_t* y_dim_padded,
               cudnnDataType_t dataType,
               at::Half* devPtrX,
               at::Half* devPtrW,
               at::Half* devPtrY,
               cudnnBackendDescriptorType_t mode) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_descriptors tensors = create_dconv_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    // mode should be one of following
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
    auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
    if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
      conv_op_builder.setdxDesc(std::get<X_OR_DX_TENSOR>(tensors))
          .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
          .setdyDesc(std::get<DY_TENSOR>(tensors))
          .setcDesc(convDesc)
          .setAlpha(alpha)
          .setBeta(beta);
    }
    else {
      conv_op_builder.setxDesc(std::get<X_OR_DX_TENSOR>(tensors))
          .setdwDesc(std::get<W_OR_DW_TENSOR>(tensors))
          .setdyDesc(std::get<DY_TENSOR>(tensors))
          .setcDesc(convDesc)
          .setAlpha(alpha)
          .setBeta(beta);
    }
    auto conv_op = conv_op_builder.build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation,
                                            w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
    int64_t uids[]    = {'x', 'y', 'w'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(3, data_ptrs)
                           .setUids(3, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
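
// Illustrative only -- the two ways this helper is invoked in
// bottleneck_backward below (the argument names stand in for the dim arrays
// set up there):
#if 0
// weight gradient: x and dy are read, dw is written
run_dconv(in_dim, pad, stride, dilation, filt_dim, out_dim, CUDNN_DATA_HALF,
          x, dw, dy, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
// data gradient: dy and w are read, dx is written
run_dconv(in_dim, pad, stride, dilation, filt_dim, out_dim, CUDNN_DATA_HALF,
          dx, w, dy, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
#endif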
// The diff excerpt truncates this helper's body; it is reconstructed here by
// analogy with run_dconv_add_drelu_dscale and the dconv_add_descriptors
// factory defined below: a dgrad node whose (virtual) output is summed with
// the residual gradient bound to uid 'r'.
void run_dconv_add(int64_t* x_dim_padded,
                   int64_t* pad,
                   int64_t* convstride,
                   int64_t* dilation,
                   int64_t* w_dim_padded,
                   int64_t* y_dim_padded,
                   cudnnDataType_t dataType,
                   at::Half* devPtrX,
                   at::Half* devPtrW,
                   at::Half* devPtrY,
                   at::Half* devPtrR) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_add_descriptors tensors = create_dconv_add_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // add descriptor for the residual gradient
    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_ADD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a dgrad convolution Node writing the virtual tensor 'A'
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create add Node: dx = dgrad output + residual gradient ('r', same shape as dx)
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setbDesc(std::get<RELU_TENSOR>(tensors))
                      .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &add_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation,
                                            w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
    int64_t uids[]    = {'x', 'y', 'w', 'r'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(4, data_ptrs)
                           .setUids(4, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
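
// Note: bottleneck_backward below uses this helper only on the stride-1 path,
// where the residual gradient (dx_conv4) can be folded into dconv1's dgrad in
// one graph; for strided convolutions the dgrad cannot be fused, so the code
// falls back to run_dconv followed by an explicit grad_x.add_().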
// inputs contains x,w,z,b,(i)
std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[]        = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim]        = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[10].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[]          = {0, 0};
  int64_t padA1[]         = {1, 1};
  int64_t dilationA[]     = {1, 1};
  int64_t convstrideA[]   = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* b = inputs[7].data_ptr<at::Half>();
  auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* y1 = out1.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(dimA,
                                     padA,
                                     convstride1X1,
                                     dilationA,
                                     filterdimA1,
                                     outdimA1,
                                     CUDNN_DATA_HALF,
                                     x,
                                     w,
                                     y1,
                                     z,
                                     b,
                                     nullptr);
  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());

  w = inputs[2].data_ptr<at::Half>();
  z = inputs[5].data_ptr<at::Half>();
  b = inputs[8].data_ptr<at::Half>();
  auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* y2 = out2.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA1,
                                     padA1,
                                     convstrideA,
                                     dilationA,
                                     filterdimA2,
                                     outdimA2,
                                     CUDNN_DATA_HALF,
                                     y1,
                                     w,
                                     y2,
                                     z,
                                     b,
                                     nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());

  // create output of conv3
  auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
  at::Half* y3 = out3.data_ptr<at::Half>();

  // create output of conv4 that may exist
  auto identity = at::empty_like(out3);
  at::Half* yi = identity.data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[10].data_ptr<at::Half>();
    z = inputs[11].data_ptr<at::Half>();
    b = inputs[12].data_ptr<at::Half>();
    run_conv_scale_bias(dimA,
                        padA,
                        convstride1X1,
                        dilationA,
                        filterdimA4,
                        outdimA3,
                        CUDNN_DATA_HALF,
                        x,
                        w,
                        yi,
                        z,
                        b);
    DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
  }
  else {
    yi = x;
  }

  w = inputs[3].data_ptr<at::Half>();
  z = inputs[6].data_ptr<at::Half>();
  b = inputs[9].data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA2,
                                     padA,
                                     convstrideA,
                                     dilationA,
                                     filterdimA3,
                                     outdimA3,
                                     CUDNN_DATA_HALF,
                                     y2,
                                     w,
                                     y3,
                                     z,
                                     b,
                                     yi);
  DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());

  outputs.push_back(out1);
  outputs.push_back(out2);
  outputs.push_back(out3);

  return outputs;
}
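
// Input/output layout assumed by the indexing above: inputs[0] is x,
// inputs[1..3] the three convolution weights, inputs[4..6] the per-channel
// scales, inputs[7..9] the biases, and inputs[10..12] the downsample
// weight/scale/bias used only when stride_1X1 != 1 or the channel count
// changes. The returned vector holds the three post-ReLU activations
// (out1, out2, out3).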
std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {

  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[]        = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim]        = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[14].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[]          = {0, 0};
  int64_t padA1[]         = {1, 1};
  int64_t dilationA[]     = {1, 1};
  int64_t convstrideA[]   = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());

  // wgrad
  auto wgrad3 = at::empty_like(inputs[3]);
  at::Half* dw3 = wgrad3.data_ptr<at::Half>();
  run_dconv(outdimA2,
            padA,
            convstrideA,
            dilationA,
            filterdimA3,
            outdimA3,
            CUDNN_DATA_HALF,
            conv_in,
            dw3,
            dy3,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* w = inputs[3].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();

  at::Half* relu2 = inputs[13].data_ptr<at::Half>();

  run_dconv_drelu_dscale(outdimA2,
                         padA,
                         convstrideA,
                         dilationA,
                         filterdimA3,
                         outdimA3,
                         CUDNN_DATA_HALF,
                         dy2,
                         w,
                         dy3,
                         z,
                         relu2);

  DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());

  // dconv2+drelu1+dscale1
  conv_in = inputs[12].data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = at::empty_like(inputs[2]);
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();
  run_dconv(outdimA1,
            padA1,
            convstrideA,
            dilationA,
            filterdimA2,
            outdimA2,
            CUDNN_DATA_HALF,
            conv_in,
            dw2,
            dy2,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  w = inputs[2].data_ptr<at::Half>();
  z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1 = inputs[12].data_ptr<at::Half>();
  // fused dgrad
  run_dconv_drelu_dscale(outdimA1,
                         padA1,
                         convstrideA,
                         dilationA,
                         filterdimA2,
                         outdimA2,
                         CUDNN_DATA_HALF,
                         dy1,
                         w,
                         dy2,
                         z,
                         relu1);

  /*
  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (stride_1X1 != 1){
    // dgrad
    run_dconv(outdimA1,
              padA1,
              convstride1X1,
              dilationA,
              filterdimA2,
              outdimA2,
              CUDNN_DATA_HALF,
              dy1,
              w,
              dy2,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
    // mul fused mask
    grad_out1.mul_(inputs[15]);
  }
  else {
    at::Half* relu1 = inputs[12].data_ptr<at::Half>();
    // fused dgrad
    run_dconv_drelu_dscale(outdimA1,
                           padA1,
                           convstride1X1,
                           dilationA,
                           filterdimA2,
                           outdimA2,
                           CUDNN_DATA_HALF,
                           dy1,
                           w,
                           dy2,
                           z,
                           relu1);
  }
  */
  DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());

  // create grads of conv4 that may exist
  auto grad_x_conv4 = at::empty_like(inputs[0]);
  at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
  at::Tensor wgrad4;

  // x used for dconv1 and dconv4 wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[14].data_ptr<at::Half>();
    at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
    if (requires_grad) {
      run_dconv(dimA,
                padA,
                convstride1X1,
                dilationA,
                filterdimA4,
                outdimA3,
                CUDNN_DATA_HALF,
                dx_conv4,
                w,
                dy_conv4,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
      // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
    }
    // wgrad
    wgrad4 = at::empty_like(inputs[14]);
    at::Half* dw4 = wgrad4.data_ptr<at::Half>();
    run_dconv(dimA,
              padA,
              convstride1X1,
              dilationA,
              filterdimA4,
              outdimA3,
              CUDNN_DATA_HALF,
              x,
              dw4,
              dy_conv4,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  }
  else {
    // if there is no downsample, dx_conv4 is fork of drelu3
    dx_conv4 = inputs[11].data_ptr<at::Half>();
  }

  // dconv1+add
  // wgrad
  auto wgrad1 = at::empty_like(inputs[1]);
  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
  run_dconv(dimA,
            padA,
            convstride1X1,
            dilationA,
            filterdimA1,
            outdimA1,
            CUDNN_DATA_HALF,
            x,
            dw1,
            dy1,
            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  w = inputs[1].data_ptr<at::Half>();
  auto grad_x = at::empty_like(inputs[0]);
  at::Half* dx = grad_x.data_ptr<at::Half>();

  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (requires_grad) {
    if (stride_1X1 != 1) {
      run_dconv(dimA,
                padA,
                convstride1X1,
                dilationA,
                filterdimA1,
                outdimA1,
                CUDNN_DATA_HALF,
                dx,
                w,
                dy1,
                CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // add 2 together
      grad_x.add_(grad_x_conv4);
    }
    else {
      run_dconv_add(dimA,
                    padA,
                    convstride1X1,
                    dilationA,
                    filterdimA1,
                    outdimA1,
                    CUDNN_DATA_HALF,
                    dx,
                    w,
                    dy1,
                    dx_conv4);
    }
  }

  DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
  outputs.push_back(grad_x);
  outputs.push_back(wgrad1);
  outputs.push_back(wgrad2);
  outputs.push_back(wgrad3);

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
    outputs.push_back(wgrad4);
  }

  return outputs;
}
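
// A minimal binding sketch (illustrative only; the actual module definition
// for this extension would typically sit at the end of the translation unit
// and may differ in names and docstrings):
#if 0
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &bottleneck_forward, "fused bottleneck block forward");
  m.def("backward", &bottleneck_backward, "fused bottleneck block backward");
}
#endif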
namespace {

enum {
  X_TENSOR,
  Y_TENSOR,
  W_TENSOR,
  Z_TENSOR,
  B_TENSOR,
  AFTERADD_TENSOR,
  AFTERBIAS_TENSOR,
  AFTERCONV_TENSOR,
  OPTIONAL,
  AFTEROPT_TENSOR,
  AFTERACT_TENSOR,
  GEN_INDEX_TENSOR,
  MASK_TOP_TENSOR,
  MASK_BOTTOM_TENSOR,
  MASK_TENSOR,
  THRESHOLD_TOP_TENSOR,
  THRESHOLD_BOTTOM_TENSOR,
};

using masked_convbias_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                               cudnn_frontend::Tensor>;

masked_convbias_descriptors
create_conv_bias_add_act_mask_descriptors(int64_t* x_dim_padded,
                                          int64_t* padA,
                                          int64_t* convstrideA,
                                          int64_t* dilationA,
                                          int64_t* w_dim_padded,
                                          int64_t* y_dim_padded,
                                          int64_t* threshold_dim,
                                          cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = y_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];
  int64_t threshold_stride[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);

  return masked_convbias_descriptors(
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setId('x').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('y').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, w_dim_padded).setStrides(4, w_stride_padded)
          .setId('w').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
          .setId('z').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
          .setId('b').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setVirtual().setId('A')  // after add
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setVirtual().setId('B')  // after bias
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('C')  // after conv
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('i').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('D')  // after optional add
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('E')  // after act for masked
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('I')  // output of the gen index operation
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_INT32).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('m')  // top half of the mask created after the less than
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('n')  // bottom half of the mask
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('M')  // OR of the top and bottom masks
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, threshold_dim).setStrides(4, threshold_stride)
          .setId('t')  // threshold for creating the top mask
          .setAlignment(16).setDataType(CUDNN_DATA_INT32).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, threshold_dim).setStrides(4, threshold_stride)
          .setId('u')  // threshold for creating the bottom mask
          .setAlignment(16).setDataType(CUDNN_DATA_INT32).build());
}
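
// Note on the builder pattern above: tensors marked setVirtual() ('A', 'B',
// 'C', 'D', 'E', 'I', 'm', 'n', 'M') are graph-internal intermediates that
// cuDNN never materializes; only the non-virtual ids ('x','y','w','z','b',
// 'i','t','u') must be backed by device pointers in the variant pack. The
// scale/bias tensors are shaped {1, C, 1, 1} so they broadcast per channel.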
// tensor descriptors used for dgrad
enum {
  X_OR_DX_TENSOR,
  DY_TENSOR,
  W_OR_DW_TENSOR,
  SCALE_TENSOR,
  RELU_TENSOR,
  AFTER_DCONV_TENSOR,
  AFTER_DRELU_TENSOR,
  DGRAD_INPUT_TENSOR,
  DGRAD_OPTIONAL_TENSOR,
  DGRAD_GEN_INDEX_TENSOR,
  DGRAD_MASK_TOP_TENSOR,
  DGRAD_MASK_BOTTOM_TENSOR,
  DGRAD_MASK_TENSOR,
  DGRAD_THRESHOLD_TOP_TENSOR,
  DGRAD_THRESHOLD_BOTTOM_TENSOR,
};

using dconv_add_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                         cudnn_frontend::Tensor>;

dconv_add_descriptors
create_dconv_add_descriptors(int64_t* x_dim_padded,
                             int64_t* padA,
                             int64_t* convstrideA,
                             int64_t* dilationA,
                             int64_t* w_dim_padded,
                             int64_t* y_dim_padded,
                             cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = x_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);

  return dconv_add_descriptors(
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setId('x').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('y').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, w_dim_padded).setStrides(4, w_stride_padded)
          .setId('w').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
          .setId('s').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setId('r').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setVirtual().setId('A')  // after dconv
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setVirtual().setId('B')  // after drelu
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('i').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('D')  // after optional add
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build());
}
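
// The 9-tuple above deliberately mirrors the head of the dgrad enum
// (X_OR_DX ... DGRAD_OPTIONAL) so the dgrad helpers can index it with the
// same std::get<> constants; 'A' (after dconv) and 'B' (after drelu) are
// virtual fp32 intermediates, while 'r' shares x's shape so a residual
// gradient can be summed directly into the dgrad output.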
using dconv_mask_descriptors = std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor, cudnn_frontend::Tensor,
                                          cudnn_frontend::Tensor>;

dconv_mask_descriptors
create_dconv_mask_descriptors(int64_t* x_dim_padded,
                              int64_t* padA,
                              int64_t* convstrideA,
                              int64_t* dilationA,
                              int64_t* w_dim_padded,
                              int64_t* y_dim_padded,
                              int64_t* threshold_dim,
                              cudnnDataType_t dataType) {
  const int convDim = 2;

  int64_t b_dim_padded[4];
  b_dim_padded[0] = 1;
  b_dim_padded[1] = x_dim_padded[1];
  b_dim_padded[2] = 1;
  b_dim_padded[3] = 1;

  int64_t x_stride_padded[4];
  int64_t y_stride_padded[4];
  int64_t w_stride_padded[4];
  int64_t b_stride_padded[4];
  int64_t threshold_stride[4];

  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);

  return dconv_mask_descriptors(
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setId('x').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('y').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, w_dim_padded).setStrides(4, w_stride_padded)
          .setId('w').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, b_dim_padded).setStrides(4, b_stride_padded)
          .setId('s').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setId('r').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setVirtual().setId('A')  // after dconv
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, x_dim_padded).setStrides(4, x_stride_padded)
          .setVirtual().setId('B')  // after drelu
          .setAlignment(16).setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('i').setAlignment(16).setDataType(dataType).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('D')  // after optional add
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_FLOAT).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('I')  // output of the gen index operation
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_INT32).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('m')  // top half of the mask created after the less than
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('n')  // bottom half of the mask
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, y_dim_padded).setStrides(4, y_stride_padded)
          .setId('M')  // OR of the top and bottom masks
          .setAlignment(16).setVirtual().setDataType(CUDNN_DATA_BOOLEAN).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, threshold_dim).setStrides(4, threshold_stride)
          .setId('t')  // threshold for creating the top mask
          .setAlignment(16).setDataType(CUDNN_DATA_INT32).build(),
      cudnn_frontend::TensorBuilder()
          .setDim(4, threshold_dim).setStrides(4, threshold_stride)
          .setId('u')  // threshold for creating the bottom mask
          .setAlignment(16).setDataType(CUDNN_DATA_INT32).build());
}
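
// Same layout as dconv_add_descriptors plus the masking tail: an INT32
// virtual index tensor 'I', boolean virtual masks 'm'/'n'/'M', and the two
// INT32 threshold inputs 't'/'u' whose dims come from threshold_dim, so a
// small threshold tensor can broadcast against the full index tensor.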
void run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
                                        int64_t* pad,
                                        int64_t* convstride,
                                        int64_t* dilation,
                                        int64_t* w_dim_padded,
                                        int64_t* y_dim_padded,
                                        cudnnDataType_t dataType,
                                        at::Half* devPtrX,
                                        at::Half* devPtrW,
                                        at::Half* devPtrY,
                                        at::Half* devPtrZ,
                                        at::Half* devPtrB,
                                        at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTEROPT_TENSOR>(tensors).describe());

    // Define the scale operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the bias operation
    auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
                        .setMode(CUDNN_POINTWISE_ADD)
                        .setMathPrecision(CUDNN_DATA_FLOAT)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());

    // optional add
    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_ADD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_FWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
                       .setxDesc(std::get<X_TENSOR>(tensors))
                       .setwDesc(std::get<W_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // create an add node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(conv_op.getOutputTensor())
                      .setbDesc(std::get<OPTIONAL>(tensors))
                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create a Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(add_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setyDesc(std::get<Y_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &add_op, &scale_op, &bias_op, &act_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation,
                                            w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
    int64_t uids[]    = {'x', 'y', 'w', 'z', 'b', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(6, data_ptrs)
                           .setUids(6, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
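
// Ordering note: unlike the scale-bias-add-activation variant used by
// bottleneck_forward (conv -> scale -> bias -> optional add -> relu), this
// helper sums the side input 'i' directly onto the convolution output before
// the scale and bias are applied (conv -> add -> scale -> bias -> relu), as
// the ops array above encodes.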
void
run_conv_scale_bias_add_activation_mask
(
int64_t
*
x_dim_padded
,
int64_t
*
pad
,
int64_t
*
convstride
,
int64_t
*
dilation
,
int64_t
*
w_dim_padded
,
int64_t
*
y_dim_padded
,
int64_t
*
threshold_dim
,
cudnnDataType_t
dataType
,
at
::
Half
*
devPtrX
,
at
::
Half
*
devPtrW
,
at
::
Half
*
devPtrY
,
at
::
Half
*
devPtrZ
,
at
::
Half
*
devPtrB
,
at
::
Half
*
devPtrI
,
int
*
devPtrT
,
int
*
devPtrU
,
int
axis
)
{
cudnnHandle_t
handle_
=
torch
::
native
::
getCudnnHandle
();
std
::
stringstream
log_buf
;
try
{
int
convDim
=
2
;
// Creates the necessary tensor descriptors
masked_convbias_descriptors
tensors
=
create_conv_bias_add_act_mask_descriptors
(
x_dim_padded
,
pad
,
convstride
,
dilation
,
w_dim_padded
,
y_dim_padded
,
threshold_dim
,
dataType
);
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
X_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
Y_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
W_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
Z_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
B_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERADD_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERBIAS_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERCONV_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
OPTIONAL
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTERACT_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
GEN_INDEX_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_TOP_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_BOTTOM_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
MASK_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
THRESHOLD_TOP_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
THRESHOLD_BOTTOM_TENSOR
>
(
tensors
).
describe
());
// Define the add operation
auto
scaleDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_MUL
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
scaleDesc
.
describe
());
// Define the bias operation
auto
biasDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_ADD
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
biasDesc
.
describe
());
// optional add
auto
addDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_ADD
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
addDesc
.
describe
());
// Define the activation operation
auto
actDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_RELU_FWD
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
actDesc
.
describe
());
// Define the convolution problem
auto
convDesc
=
cudnn_frontend
::
ConvDescBuilder
()
.
setDataType
(
CUDNN_DATA_FLOAT
)
.
setMathMode
(
CUDNN_CROSS_CORRELATION
)
.
setNDims
(
convDim
)
.
setStrides
(
convDim
,
convstride
)
.
setPrePadding
(
convDim
,
pad
)
.
setPostPadding
(
convDim
,
pad
)
.
setDilation
(
convDim
,
dilation
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
convDesc
.
describe
());
// Define the genIndex descriptor
auto
genIndexDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_GEN_INDEX
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
setAxis
(
axis
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
genIndexDesc
.
describe
());
// Define the lessThan descriptor
auto
lessThanDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_CMP_LT
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
lessThanDesc
.
describe
());
// Define the greaterThan descriptor
auto
greaterThanDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_CMP_GT
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
greaterThanDesc
.
describe
());
// Define the logical_or descriptor
auto
logicalOrDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_LOGICAL_OR
)
.
setMathPrecision
(
CUDNN_DATA_BOOLEAN
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
logicalOrDesc
.
describe
());
// Define the binary_selection descriptor
auto
selectionDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
.
setMode
(
CUDNN_POINTWISE_BINARY_SELECT
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
selectionDesc
.
describe
());
float
alpha
=
1.0
f
;
float
beta
=
0.0
f
;
// Create a convolution Node
auto
conv_op
=
cudnn_frontend
::
OperationBuilder
(
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
)
.
setxDesc
(
std
::
get
<
X_TENSOR
>
(
tensors
))
.
setwDesc
(
std
::
get
<
W_TENSOR
>
(
tensors
))
.
setyDesc
(
std
::
get
<
AFTERCONV_TENSOR
>
(
tensors
))
.
setcDesc
(
convDesc
)
.
setAlpha
(
alpha
)
.
setBeta
(
beta
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
conv_op
.
describe
());
    // Create an Add Node with scaling parameters.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(conv_op.getOutputTensor())
                        .setbDesc(std::get<Z_TENSOR>(tensors))
                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Bias Node.
    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
                       .setpwDesc(biasDesc)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());

    // Create an optional add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(bias_op.getOutputTensor())
                      .setbDesc(std::get<OPTIONAL>(tensors))
                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // Create an Activation Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
                      .setyDesc(std::get<AFTERACT_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
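    // The ternary on devPtrI above wires the activation to read either the
    // residual-added tensor (when an identity input is supplied) or the bias
    // output directly, so one graph definition serves both block variants.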
    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<AFTERACT_TENSOR>(tensors))
                           .setyDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());

    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());

    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());

    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());

    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTERCONV_TENSOR>(tensors))
                            .setbDesc(std::get<AFTERACT_TENSOR>(tensors))
                            .settDesc(std::get<MASK_TENSOR>(tensors))
                            .setyDesc(std::get<Y_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());
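    // Two graph variants are finalized below: with a residual input (devPtrI)
    // the graph has 10 nodes and binds 8 device pointers; without it the
    // optional add node and uid 'i' are dropped (9 nodes, 7 pointers).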
    // Create an Operation Graph. In this case it is convolution add bias activation
    if (devPtrI) {
      std::array<cudnn_frontend::Operation const*, 10> ops = {&conv_op, &scale_op, &bias_op, &add_op, &act_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

      auto opGraph = cudnn_frontend::OperationGraphBuilder()
                         .setHandle(handle_)
                         .setOperationGraph(ops.size(), ops.data())
                         .build();

      // Create string encoding for plan caching
      auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

      auto workspace_size = plan.getWorkspaceSize();
      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

      void* workspace_ptr = nullptr;
      auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
      if (workspace_size > 0) {
        workspace_ptr = workspace_tensor.data_ptr<float>();
      }

      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI, devPtrT, devPtrU};
      int64_t uids[]    = {'x', 'y', 'w', 'z', 'b', 'i', 't', 'u'};
      auto variantPack = cudnn_frontend::VariantPackBuilder()
                             .setWorkspacePointer(workspace_ptr)
                             .setDataPointers(8, data_ptrs)
                             .setUids(8, uids)
                             .build();
      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());

      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
      checkCudnnErr(status);
      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
    }
    else {
      std::array<cudnn_frontend::Operation const*, 9> ops = {&conv_op, &scale_op, &bias_op, &act_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

      auto opGraph = cudnn_frontend::OperationGraphBuilder()
                         .setHandle(handle_)
                         .setOperationGraph(ops.size(), ops.data())
                         .build();

      // Create string encoding for plan caching
      auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

      auto workspace_size = plan.getWorkspaceSize();
      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

      void* workspace_ptr = nullptr;
      auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
      if (workspace_size > 0) {
        workspace_ptr = workspace_tensor.data_ptr<float>();
      }

      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrT, devPtrU};
      int64_t uids[]    = {'x', 'y', 'w', 'z', 'b', 't', 'u'};
      auto variantPack = cudnn_frontend::VariantPackBuilder()
                             .setWorkspacePointer(workspace_ptr)
                             .setDataPointers(7, data_ptrs)
                             .setUids(7, uids)
                             .build();
      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());

      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
      checkCudnnErr(status);
      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
    }
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
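// A minimal sketch of the plan-cache pattern getOrCreatePlan() relies on,
// keyed by the getConvFusionString() signature (hypothetical names; the real
// helper is defined earlier in this file and may differ):
//
//   static std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> cache;
//   auto it = cache.find(cache_string);
//   if (it == cache.end())
//     it = cache.emplace(cache_string, buildPlanForGraph(handle_, opGraph)).first;
//   return it->second;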
void run_dconv_add_drelu_dscale(int64_t* x_dim_padded,
                                int64_t* pad,
                                int64_t* convstride,
                                int64_t* dilation,
                                int64_t* w_dim_padded,
                                int64_t* y_dim_padded,
                                cudnnDataType_t dataType,
                                at::Half* devPtrX,
                                at::Half* devPtrW,
                                at::Half* devPtrY,
                                at::Half* devPtrZ,
                                at::Half* devPtrR,
                                at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_add_descriptors tensors = create_dconv_add_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
...
@@ -2201,6 +2568,8 @@ run_dconv_add(int64_t* x_dim_padded,
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_INPUT_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
...
@@ -2214,12 +2583,26 @@ run_dconv_add(int64_t* x_dim_padded,
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // optional add
    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_ADD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;
...
@@ -2235,564 +2618,292 @@ run_dconv_add(int64_t* x_dim_padded,
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // Create add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setbDesc(std::get<DGRAD_INPUT_TENSOR>(tensors))
                      .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create a relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &add_op, &act_op, &scale_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }

    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrI};
    int64_t uids[]    = {'x', 'y', 'w', 's', 'r', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(6, data_ptrs)
                           .setUids(6, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());

    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
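// By construction of the two arrays above, the uid <-> pointer binding for
// this graph is: 'x'=devPtrX (dgrad output dx), 'y'=devPtrY (incoming dy),
// 'w'=devPtrW (weights), 's'=devPtrZ (scale), 'r'=devPtrR (forward relu
// output), 'i'=devPtrI (the extra input added to the conv dgrad before the
// drelu step).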
// inputs contains x,w,z,b,(i)
std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[]        = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim]        = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[10].size(axis[dim]);
    }
  }
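  // The fourth filter (inputs[10]) is the 1x1 downsample conv on the shortcut;
  // it only exists when the residual branch must change shape, i.e. when
  // stride_1X1 != 1 or the block's output channels (filterdimA3[0]) differ
  // from the input channels (dimA[1]).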
  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[]          = {0, 0};
  int64_t padA1[]         = {1, 1};
  int64_t dilationA[]     = {1, 1};
  int64_t convstrideA[]   = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }
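  // getFwdConvOutputDim is assumed to implement the standard convolution shape
  // rule for each spatial dim:
  //   out = 1 + (in + 2*pad - (dilation*(filter - 1) + 1)) / stride
  // With pad=0, stride=1, dilation=1 the 1x1 convs preserve H and W, and the
  // 3x3 conv2 keeps them via padA1 = {1,1}.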
  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }
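  // For explicit_nhwc the permutation {0,2,3,1} converts the backend's
  // n,c,h,w dims back into the NHWC shape PyTorch sees; for channels_last the
  // logical NCHW sizes are kept and output_format carries the layout instead.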
  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* b = inputs[7].data_ptr<at::Half>();
  auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* y1 = out1.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, w, y1, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());

  w = inputs[2].data_ptr<at::Half>();
  z = inputs[5].data_ptr<at::Half>();
  b = inputs[8].data_ptr<at::Half>();
  auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* y2 = out2.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());

  // create output of conv3
  auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
  at::Half* y3 = out3.data_ptr<at::Half>();

  // create output of conv4 that may exist
  auto identity = at::empty_like(out3);
  at::Half* yi = identity.data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[10].data_ptr<at::Half>();
    z = inputs[11].data_ptr<at::Half>();
    b = inputs[12].data_ptr<at::Half>();
    run_conv_scale_bias(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, w, yi, z, b);
    DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
  }
  else {
    yi = x;
  }

  w = inputs[3].data_ptr<at::Half>();
  z = inputs[6].data_ptr<at::Half>();
  b = inputs[9].data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, y2, w, y3, z, b, yi);
  DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());

  outputs.push_back(out1);
  outputs.push_back(out2);
  outputs.push_back(out3);

  return outputs;
}
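// Input packing assumed by bottleneck_forward, as read above: inputs[0]=x,
// inputs[1..3]=the three conv weights, inputs[4..6]=their scales (z),
// inputs[7..9]=their biases (b), inputs[10..12]=downsample weight/scale/bias
// when the shortcut conv exists.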
std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {

  bool requires_grad = inputs[0].requires_grad();

  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[]        = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim]        = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[14].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[]          = {0, 0};
  int64_t padA1[]         = {1, 1};
  int64_t dilationA[]     = {1, 1};
  int64_t convstrideA[]   = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());

  // wgrad
  auto wgrad3 = at::empty_like(inputs[3]);
  at::Half* dw3 = wgrad3.data_ptr<at::Half>();
  run_dconv(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, conv_in, dw3, dy3, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* w = inputs[3].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* relu2 = inputs[13].data_ptr<at::Half>();

  run_dconv_drelu_dscale(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, dy2, w, dy3, z, relu2);
  DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());

  // dconv2+drelu1+dscale1
  conv_in = inputs[12].data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = at::empty_like(inputs[2]);
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();
  run_dconv(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, conv_in, dw2, dy2, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  w = inputs[2].data_ptr<at::Half>();
  z = inputs[4].data_ptr<at::Half>();

  at::Half* relu1 = inputs[12].data_ptr<at::Half>();
  // fused dgrad
  run_dconv_drelu_dscale(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, dy1, w, dy2, z, relu1);

  /*
  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (stride_1X1 != 1){
    // dgrad
    run_dconv(outdimA1,
              padA1,
              convstride1X1,
              dilationA,
              filterdimA2,
              outdimA2,
              CUDNN_DATA_HALF,
              dy1,
              w,
              dy2,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
    // mul fused mask
    grad_out1.mul_(inputs[15]);
  }
  else {
    at::Half* relu1 = inputs[12].data_ptr<at::Half>();
    // fused dgrad
    run_dconv_drelu_dscale(outdimA1,
                           padA1,
                           convstride1X1,
                           dilationA,
                           filterdimA2,
                           outdimA2,
                           CUDNN_DATA_HALF,
                           dy1,
                           w,
                           dy2,
                           z,
                           relu1);
  }
  */
  DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());

  // create grads of conv4 that may exist
  auto grad_x_conv4 = at::empty_like(inputs[0]);
  at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
  at::Tensor wgrad4;

  // x used for dconv1 and dconv4 wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    w = inputs[14].data_ptr<at::Half>();
    at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
    if (requires_grad) {
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, dx_conv4, w, dy_conv4, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
      // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
    }
    // wgrad
    wgrad4 = at::empty_like(inputs[14]);
    at::Half* dw4 = wgrad4.data_ptr<at::Half>();
    run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, dw4, dy_conv4, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  }
  else {
    // if there is no downsample, dx_conv4 is fork of drelu3
    dx_conv4 = inputs[11].data_ptr<at::Half>();
  }

  // dconv1+add
  // wgrad
  auto wgrad1 = at::empty_like(inputs[1]);
  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
  run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, dw1, dy1, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  w = inputs[1].data_ptr<at::Half>();
  auto grad_x = at::empty_like(inputs[0]);
  at::Half* dx = grad_x.data_ptr<at::Half>();

  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (requires_grad) {
    if (stride_1X1 != 1) {
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // add 2 together
      grad_x.add_(grad_x_conv4);
    }
    else {
      run_dconv_add(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1, dx_conv4);
    }
  }

  DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
  outputs.push_back(grad_x);
  outputs.push_back(wgrad1);
  outputs.push_back(wgrad2);
  outputs.push_back(wgrad3);

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
    outputs.push_back(wgrad4);
  }

  return outputs;
}
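// The masked dgrad fusion below mirrors run_dconv_drelu_dscale but
// additionally builds the gen-index / compare / logical-or mask and a binary
// select, so elements outside the [threshold_top, threshold_bottom] band
// along the given axis can take the raw conv dgrad instead of the
// drelu+dscale path (presumably for halo regions when the input is split
// spatially across GPUs).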
void run_dconv_drelu_dscale_mask(int64_t* x_dim_padded,
                                 int64_t* pad,
                                 int64_t* convstride,
                                 int64_t* dilation,
                                 int64_t* w_dim_padded,
                                 int64_t* y_dim_padded,
                                 int64_t* threshold_dim,
                                 cudnnDataType_t dataType,
                                 at::Half* devPtrX,
                                 at::Half* devPtrW,
                                 at::Half* devPtrY,
                                 at::Half* devPtrZ,
                                 at::Half* devPtrR,
                                 int* devPtrT,
                                 int* devPtrU,
                                 int axis) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;

    // Creates the necessary tensor descriptors
    dconv_mask_descriptors tensors = create_dconv_mask_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_GEN_INDEX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors).describe());

    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());

    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());

    // Define the scale backward operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());

    // Define the genIndex descriptor
    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .setAxis(axis)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());

    // Define the lessThan descriptor
    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_CMP_LT)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());

    // Define the greaterThan descriptor
    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
                               .setMode(CUDNN_POINTWISE_CMP_GT)
                               .setMathPrecision(CUDNN_DATA_FLOAT)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());

    // Define the logical_or descriptor
    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());

    // Define the binary_selection descriptor
    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());

    float alpha = 1.0f;
    float beta = 0.0f;

    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());

    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create a relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());

    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());

    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());

    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());

    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());

    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());

    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                            .settDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());

    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 8> ops = {&conv_op, &act_op, &scale_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};

    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();

    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);

    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());

    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);

    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }

    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrT, devPtrU};
    int64_t uids[]    = {'x', 'y', 'w', 's', 'r', 't', 'u'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(7, data_ptrs)
                           .setUids(7, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());

    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  }
  catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}

namespace {

struct bottleneck_forward_status {

  int64_t dimA[4];
...
apex/contrib/csrc/peer_memory/peer_memory_cuda.cu
View file @ fa8e7d99
...
@@ -388,7 +388,7 @@ void push_pull_halos_1d(
   const int numThreads = 128;
   dim3 block(numThreads, 1, 1);
   AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, top_out_halo.scalar_type(), "push_pull_halos_1d_kernel", [&]{
-    if (diagnostics) printf("size(scalar_t) = %d\n", sizeof(scalar_t));
+    if (diagnostics) printf("size(scalar_t) = %ld\n", sizeof(scalar_t));
     scalar_t* toh_p = top_out_halo.data_ptr<scalar_t>();
     scalar_t* tox_p = top_out_tx.data_ptr<scalar_t>();
     scalar_t* tix_p = top_inp_tx.data_ptr<scalar_t>();
...