GitLab: OpenDAS / apex / Commits

Commit fa8e7d99
authored Apr 03, 2022 by Thor Johnsen

Clean up code

parent 05dd9c69
Changes: 2 changed files with 1884 additions and 1773 deletions (+1884 -1773)

  apex/contrib/csrc/bottleneck/bottleneck.cpp        +1883 -1772
  apex/contrib/csrc/peer_memory/peer_memory_cuda.cu  +1 -1
apex/contrib/csrc/bottleneck/bottleneck.cpp @ fa8e7d99
@@ -102,13 +102,6 @@ enum {
   AFTERCONV_TENSOR,
   OPTIONAL,
   AFTEROPT_TENSOR,
-  AFTERACT_TENSOR,
-  GEN_INDEX_TENSOR,
-  MASK_TOP_TENSOR,
-  MASK_BOTTOM_TENSOR,
-  MASK_TENSOR,
-  THRESHOLD_TOP_TENSOR,
-  THRESHOLD_BOTTOM_TENSOR,
+  AFTERACT_TENSOR
 };
 
 using common_conv_descriptors =
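Note: these enumerators are used throughout the file as positional indices into std::tuple collections of cudnn_frontend::Tensor, which is why the enum and the tuple aliases below shrink together. A self-contained sketch of the pattern, with plain ints standing in for tensors:

    #include <tuple>
    #include <cassert>

    // Unscoped enumerators are compile-time constants 0, 1, 2, ... so they can
    // index a std::tuple via std::get, mirroring how X_TENSOR etc. are used here.
    enum { X_TENSOR, Y_TENSOR, W_TENSOR };
    using descriptors = std::tuple<int, int, int>;

    int main() {
        descriptors tensors{10, 20, 30};
        assert(std::get<Y_TENSOR>(tensors) == 20); // enum picks the tuple slot
        return 0;
    }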
@@ -180,11 +173,11 @@ using common_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
 common_convbias_descriptors
 create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
                                      int64_t* padA,
                                      int64_t* convstrideA,
                                      int64_t* dilationA,
                                      int64_t* w_dim_padded,
                                      int64_t* y_dim_padded,
-                                     cudnnDataType_t dataType)
-{
+                                     cudnnDataType_t dataType) {
   const int convDim = 2;
   int64_t b_dim_padded[4];
@@ -279,183 +272,6 @@ create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
                          .build());
 }
 
-using masked_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor,
-                                               cudnn_frontend::Tensor>;
-
-masked_convbias_descriptors
-create_conv_bias_add_act_mask_descriptors(int64_t* x_dim_padded,
-                                          int64_t* padA,
-                                          int64_t* convstrideA,
-                                          int64_t* dilationA,
-                                          int64_t* w_dim_padded,
-                                          int64_t* y_dim_padded,
-                                          int64_t* threshold_dim,
-                                          cudnnDataType_t dataType)
-{
-  const int convDim = 2;
-
-  int64_t b_dim_padded[4];
-  b_dim_padded[0] = 1;
-  b_dim_padded[1] = y_dim_padded[1];
-  b_dim_padded[2] = 1;
-  b_dim_padded[3] = 1;
-
-  int64_t x_stride_padded[4];
-  int64_t y_stride_padded[4];
-  int64_t w_stride_padded[4];
-  int64_t b_stride_padded[4];
-  int64_t threshold_stride[4];
-
-  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
-
-  return masked_convbias_descriptors(cudnn_frontend::TensorBuilder()
-                                         .setDim(4, x_dim_padded)
-                                         .setStrides(4, x_stride_padded)
-                                         .setId('x')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('y')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, w_dim_padded)
-                                         .setStrides(4, w_stride_padded)
-                                         .setId('w')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, b_dim_padded)
-                                         .setStrides(4, b_stride_padded)
-                                         .setId('z')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, b_dim_padded)
-                                         .setStrides(4, b_stride_padded)
-                                         .setId('b')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setVirtual()
-                                         .setId('A') // after add
-                                         .setAlignment(16)
-                                         .setDataType(CUDNN_DATA_FLOAT)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setVirtual()
-                                         .setId('B') // after bias
-                                         .setAlignment(16)
-                                         .setDataType(CUDNN_DATA_FLOAT)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('C') // after conv
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_FLOAT)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('i')
-                                         .setAlignment(16)
-                                         .setDataType(dataType)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('D') // after optional add
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_FLOAT)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('E') // after act for masked
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_FLOAT)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('I') // output of the gen index operation
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_INT32)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('m') // top half of the mask created after the less than
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_BOOLEAN)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('n') // bottom half of the mask
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_BOOLEAN)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, y_dim_padded)
-                                         .setStrides(4, y_stride_padded)
-                                         .setId('M') // OR of the top and bottom masks
-                                         .setAlignment(16)
-                                         .setVirtual()
-                                         .setDataType(CUDNN_DATA_BOOLEAN)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, threshold_dim)
-                                         .setStrides(4, threshold_stride)
-                                         .setId('t') // threshold for creating the top mask
-                                         .setAlignment(16)
-                                         .setDataType(CUDNN_DATA_INT32)
-                                         .build(),
-                                     cudnn_frontend::TensorBuilder()
-                                         .setDim(4, threshold_dim)
-                                         .setStrides(4, threshold_stride)
-                                         .setId('u') // threshold for creating the bottom mask
-                                         .setAlignment(16)
-                                         .setDataType(CUDNN_DATA_INT32)
-                                         .build());
-}
-
 // tensor descriptors used for dgrad
 enum {
   X_OR_DX_TENSOR,
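Note: the removed helper above follows the cudnn_frontend TensorBuilder pattern used across this file: tensors with a plain uid receive device pointers at execution time, while setVirtual() marks intermediates that exist only inside the fused graph. A hedged fragment illustrating the two cases (assumes the cudnn_frontend v0.x builder API; not the file's exact code):

    #include <cudnn_frontend.h>

    // A real graph input ('x') versus a virtual intermediate ('A'). Virtual
    // tensors never receive a device pointer; they only connect fused ops.
    cudnn_frontend::Tensor makeRealInput(int64_t* dim, int64_t* stride,
                                         cudnnDataType_t dataType) {
        return cudnn_frontend::TensorBuilder()
                   .setDim(4, dim)
                   .setStrides(4, stride)
                   .setId('x')          // matched against the uids[] at execution
                   .setAlignment(16)    // promised pointer alignment in bytes
                   .setDataType(dataType)
                   .build();
    }

    cudnn_frontend::Tensor makeVirtualIntermediate(int64_t* dim, int64_t* stride) {
        return cudnn_frontend::TensorBuilder()
                   .setDim(4, dim)
                   .setStrides(4, stride)
                   .setVirtual()        // internal to the fused graph
                   .setId('A')
                   .setAlignment(16)
                   .setDataType(CUDNN_DATA_FLOAT)
                   .build();
    }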
@@ -465,14 +281,6 @@ enum {
   RELU_TENSOR,
   AFTER_DCONV_TENSOR,
   AFTER_DRELU_TENSOR,
-  DGRAD_INPUT_TENSOR,
-  DGRAD_OPTIONAL_TENSOR,
-  DGRAD_GEN_INDEX_TENSOR,
-  DGRAD_MASK_TOP_TENSOR,
-  DGRAD_MASK_BOTTOM_TENSOR,
-  DGRAD_MASK_TENSOR,
-  DGRAD_THRESHOLD_TOP_TENSOR,
-  DGRAD_THRESHOLD_BOTTOM_TENSOR,
 };
 
 using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
@@ -481,8 +289,6 @@ using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor,
-                                     cudnn_frontend::Tensor,
-                                     cudnn_frontend::Tensor,
                                      cudnn_frontend::Tensor>;
 dconv_descriptors
@@ -561,194 +367,20 @@ create_dconv_descriptors(int64_t* x_dim_padded,
                          .setId('B') // after drelu
                          .setAlignment(16)
                          .setDataType(CUDNN_DATA_FLOAT)
-                         .build(),
-                     cudnn_frontend::TensorBuilder()
-                         .setDim(4, y_dim_padded)
-                         .setStrides(4, y_stride_padded)
-                         .setId('i')
-                         .setAlignment(16)
-                         .setDataType(dataType)
-                         .build(),
-                     cudnn_frontend::TensorBuilder()
-                         .setDim(4, y_dim_padded)
-                         .setStrides(4, y_stride_padded)
-                         .setId('D') // after optional add
-                         .setAlignment(16)
-                         .setVirtual()
-                         .setDataType(CUDNN_DATA_FLOAT)
-                         .build());
+                         .build());
 }
 
-using dconv_mask_descriptors = std::tuple<cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor,
-                                          cudnn_frontend::Tensor>;
-
-dconv_mask_descriptors
-create_dconv_mask_descriptors(int64_t* x_dim_padded,
-                              int64_t* padA,
-                              int64_t* convstrideA,
-                              int64_t* dilationA,
-                              int64_t* w_dim_padded,
-                              int64_t* y_dim_padded,
-                              int64_t* threshold_dim,
-                              cudnnDataType_t dataType)
-{
-  const int convDim = 2;
-
-  int64_t b_dim_padded[4];
-  b_dim_padded[0] = 1;
-  b_dim_padded[1] = x_dim_padded[1];
-  b_dim_padded[2] = 1;
-  b_dim_padded[3] = 1;
-
-  int64_t x_stride_padded[4];
-  int64_t y_stride_padded[4];
-  int64_t w_stride_padded[4];
-  int64_t b_stride_padded[4];
-  int64_t threshold_stride[4];
-
-  generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
-  generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
-
-  return dconv_mask_descriptors(cudnn_frontend::TensorBuilder()
-                                    .setDim(4, x_dim_padded)
-                                    .setStrides(4, x_stride_padded)
-                                    .setId('x')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('y')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, w_dim_padded)
-                                    .setStrides(4, w_stride_padded)
-                                    .setId('w')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, b_dim_padded)
-                                    .setStrides(4, b_stride_padded)
-                                    .setId('s')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, x_dim_padded)
-                                    .setStrides(4, x_stride_padded)
-                                    .setId('r')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, x_dim_padded)
-                                    .setStrides(4, x_stride_padded)
-                                    .setVirtual()
-                                    .setId('A') // after dconv
-                                    .setAlignment(16)
-                                    .setDataType(CUDNN_DATA_FLOAT)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, x_dim_padded)
-                                    .setStrides(4, x_stride_padded)
-                                    .setVirtual()
-                                    .setId('B') // after drelu
-                                    .setAlignment(16)
-                                    .setDataType(CUDNN_DATA_FLOAT)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('i')
-                                    .setAlignment(16)
-                                    .setDataType(dataType)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('D') // after optional add
-                                    .setAlignment(16)
-                                    .setVirtual()
-                                    .setDataType(CUDNN_DATA_FLOAT)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('I') // output of the gen index operation
-                                    .setAlignment(16)
-                                    .setVirtual()
-                                    .setDataType(CUDNN_DATA_INT32)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('m') // top half of the mask created after the less than
-                                    .setAlignment(16)
-                                    .setVirtual()
-                                    .setDataType(CUDNN_DATA_BOOLEAN)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('n') // bottom half of the mask
-                                    .setAlignment(16)
-                                    .setVirtual()
-                                    .setDataType(CUDNN_DATA_BOOLEAN)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, y_dim_padded)
-                                    .setStrides(4, y_stride_padded)
-                                    .setId('M') // OR of the top and bottom masks
-                                    .setAlignment(16)
-                                    .setVirtual()
-                                    .setDataType(CUDNN_DATA_BOOLEAN)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, threshold_dim)
-                                    .setStrides(4, threshold_stride)
-                                    .setId('t') // threshold for creating the top mask
-                                    .setAlignment(16)
-                                    .setDataType(CUDNN_DATA_INT32)
-                                    .build(),
-                                cudnn_frontend::TensorBuilder()
-                                    .setDim(4, threshold_dim)
-                                    .setStrides(4, threshold_stride)
-                                    .setId('u') // threshold for creating the bottom mask
-                                    .setAlignment(16)
-                                    .setDataType(CUDNN_DATA_INT32)
-                                    .build());
-}
-
 // create a cache for plan
 std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
 
 // TODO: better name
 std::string getConvFusionString(int64_t* x_dim_padded,
                                 int64_t* padA,
                                 int64_t* convstrideA,
                                 int64_t* dilationA,
                                 int64_t* w_dim_padded,
                                 cudnnDataType_t dataType,
-                                std::string fusion_string)
-{
+                                std::string fusion_string) {
   for(int i=0;i<4;i++) {
     fusion_string += 'X';
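Note: getConvFusionString builds a cache key from the problem dimensions plus the operation graph tag, and plan_cache maps that key to a built ExecutionPlan. The file's getOrCreatePlan is not shown in this diff, so the following is only an assumed sketch of how such a string-keyed lookup is commonly written with cudnn_frontend heuristics (not the file's actual implementation):

    // Sketch: look up an ExecutionPlan by fusion string, building it on a miss.
    // Assumes the global plan_cache declared above; error handling elided.
    cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle,
                                                   std::stringstream& log_buf,
                                                   cudnn_frontend::OperationGraph& opGraph,
                                                   std::string const& cache_string) {
        auto it = plan_cache.find(cache_string);
        if (it != plan_cache.end()) return it->second;   // cache hit: reuse the plan
        // cache miss: take the first engine config suggested by heuristics
        auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
                              .setOperationGraph(opGraph)
                              .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
                              .build();
        auto& cfg = heuristics.getEngineConfig(1)[0];
        auto plan = cudnn_frontend::ExecutionPlanBuilder()
                        .setHandle(handle)
                        .setEngineConfig(cfg, opGraph.getTag())
                        .build();
        return plan_cache.emplace(cache_string, std::move(plan)).first->second;
    }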
@@ -961,7 +593,7 @@ run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
     auto opGraph = cudnn_frontend::OperationGraphBuilder()
                        .setHandle(handle_)
-                       .setOperationGraph(devPtrI ? ops.size() : ops.size() - 1, ops.data())
+                       .setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
                        .build();
 
     // Create string encoding for plan caching
@@ -996,19 +628,18 @@ run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
 }
 
 void
-run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
-                                   int64_t* pad,
-                                   int64_t* convstride,
-                                   int64_t* dilation,
-                                   int64_t* w_dim_padded,
-                                   int64_t* y_dim_padded,
-                                   cudnnDataType_t dataType,
-                                   at::Half* devPtrX,
-                                   at::Half* devPtrW,
-                                   at::Half* devPtrY,
-                                   at::Half* devPtrZ,
-                                   at::Half* devPtrB,
-                                   at::Half* devPtrI)
-{
+run_conv_scale_bias(int64_t* x_dim_padded,
+                    int64_t* pad,
+                    int64_t* convstride,
+                    int64_t* dilation,
+                    int64_t* w_dim_padded,
+                    int64_t* y_dim_padded,
+                    cudnnDataType_t dataType,
+                    at::Half* devPtrX,
+                    at::Half* devPtrW,
+                    at::Half* devPtrY,
+                    at::Half* devPtrZ,
+                    at::Half* devPtrB) {
   cudnnHandle_t handle_ = torch::native::getCudnnHandle();
   std::stringstream log_buf;
   try {
@@ -1026,36 +657,21 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
     DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
     DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
     DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTEROPT_TENSOR>(tensors).describe());
 
     // Define the add operation
     auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                          .setMode(CUDNN_POINTWISE_MUL)
                          .setMathPrecision(CUDNN_DATA_FLOAT)
                          .build();
     DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
 
     // Define the bias operation
-    auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
-                        .setMode(CUDNN_POINTWISE_ADD)
-                        .setMathPrecision(CUDNN_DATA_FLOAT)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
-
-    // optional add
     auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                        .setMode(CUDNN_POINTWISE_ADD)
                        .setMathPrecision(CUDNN_DATA_FLOAT)
                        .build();
     DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
 
-    // Define the activation operation
-    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_RELU_FWD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
     // Define the convolution problem
     auto convDesc = cudnn_frontend::ConvDescBuilder()
                         .setDataType(CUDNN_DATA_FLOAT)
@@ -1082,43 +698,26 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
                        .build();
     DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-    // create an add node.
-    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(conv_op.getOutputTensor())
-                      .setbDesc(std::get<OPTIONAL>(tensors))
-                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
-                      .setpwDesc(addDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
     // Create a Add Node with scaling parameters.
     auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                        .setxDesc(add_op.getOutputTensor())
+                        .setxDesc(conv_op.getOutputTensor())
                         .setbDesc(std::get<Z_TENSOR>(tensors))
                         .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
+                        // TODO: change enum to aftermul
                         .setpwDesc(scaleDesc)
                         .build();
     DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
 
     // Create a Bias Node.
-    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                       .setxDesc(scale_op.getOutputTensor())
                       .setbDesc(std::get<B_TENSOR>(tensors))
-                      .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
-                      .setpwDesc(biasDesc)
+                      .setyDesc(std::get<Y_TENSOR>(tensors))
+                      .setpwDesc(addDesc)
                       .build();
-    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
-
-    // Create an Activation Node.
-    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(bias_op.getOutputTensor())
-                      .setyDesc(std::get<Y_TENSOR>(tensors))
-                      .setpwDesc(actDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
+    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
 
     // Create an Operation Graph. In this case it is convolution add bias activation
-    std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &add_op, &scale_op, &bias_op, &act_op};
+    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};
 
     auto opGraph = cudnn_frontend::OperationGraphBuilder()
                        .setHandle(handle_)
@@ -1140,12 +739,12 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
     if (workspace_size > 0) {
       workspace_ptr = workspace_tensor.data_ptr<float>();
     }
-    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
-    int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
+    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
+    int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
     auto variantPack = cudnn_frontend::VariantPackBuilder()
                            .setWorkspacePointer(workspace_ptr)
-                           .setDataPointers(6, data_ptrs)
-                           .setUids(6, uids)
+                           .setDataPointers(5, data_ptrs)
+                           .setUids(5, uids)
                            .build();
     DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
     cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
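Note: the workspace idiom visible above recurs in every run_* helper in this file: cuDNN reports a workspace size in bytes, and the code provisions it through PyTorch's caching allocator as a float tensor of ceil(bytes/4) elements. Restated on its own:

    // Allocate cuDNN workspace via the PyTorch caching allocator: workspace_size
    // is in bytes; a float tensor with (workspace_size + 3) / 4 elements covers it.
    auto workspace_tensor = at::empty({(workspace_size + 3) / 4},
                                      at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    void* workspace_ptr = workspace_size > 0 ? workspace_tensor.data_ptr<float>() : nullptr;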
@@ -1156,76 +755,35 @@ run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
   }
 }
 
 void
-run_conv_scale_bias_add_activation_mask(int64_t* x_dim_padded,
-                                        int64_t* pad,
-                                        int64_t* convstride,
-                                        int64_t* dilation,
-                                        int64_t* w_dim_padded,
-                                        int64_t* y_dim_padded,
-                                        int64_t* threshold_dim,
-                                        cudnnDataType_t dataType,
-                                        at::Half* devPtrX,
-                                        at::Half* devPtrW,
-                                        at::Half* devPtrY,
-                                        at::Half* devPtrZ,
-                                        at::Half* devPtrB,
-                                        at::Half* devPtrI,
-                                        int* devPtrT,
-                                        int* devPtrU,
-                                        int axis)
-{
+run_dconv_drelu_dscale(int64_t* x_dim_padded,
+                       int64_t* pad,
+                       int64_t* convstride,
+                       int64_t* dilation,
+                       int64_t* w_dim_padded,
+                       int64_t* y_dim_padded,
+                       cudnnDataType_t dataType,
+                       at::Half* devPtrX,
+                       at::Half* devPtrW,
+                       at::Half* devPtrY,
+                       at::Half* devPtrZ,
+                       at::Half* devPtrR) {
   cudnnHandle_t handle_ = torch::native::getCudnnHandle();
   std::stringstream log_buf;
   try {
     int convDim = 2;
 
     // Creates the necessary tensor descriptors
-    masked_convbias_descriptors tensors = create_conv_bias_add_act_mask_descriptors(
-        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
-    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERACT_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<GEN_INDEX_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TOP_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_BOTTOM_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_TOP_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_BOTTOM_TENSOR>(tensors).describe());
-
-    // Define the add operation
-    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                         .setMode(CUDNN_POINTWISE_MUL)
-                         .setMathPrecision(CUDNN_DATA_FLOAT)
-                         .build();
-    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
-
-    // Define the bias operation
-    auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
-                        .setMode(CUDNN_POINTWISE_ADD)
-                        .setMathPrecision(CUDNN_DATA_FLOAT)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
-
-    // optional add
-    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_ADD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
-
-    // Define the activation operation
-    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_RELU_FWD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
+    dconv_descriptors tensors = create_dconv_descriptors(
+        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
 
     // Define the convolution problem
     auto convDesc = cudnn_frontend::ConvDescBuilder()
@@ -1239,258 +797,119 @@ run_conv_scale_bias_add_activation_mask(int64_t* x_dim_padded,
                         .build();
     DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
 
-    // Define the genIndex descriptor
-    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
-                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
-                            .setMathPrecision(CUDNN_DATA_FLOAT)
-                            .setAxis(axis)
-                            .build();
-    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());
-
-    // Define the lessThan descriptor
-    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
-                            .setMode(CUDNN_POINTWISE_CMP_LT)
-                            .setMathPrecision(CUDNN_DATA_FLOAT)
-                            .build();
-    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());
-
-    // Define the greaterThan descriptor
-    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
-                               .setMode(CUDNN_POINTWISE_CMP_GT)
-                               .setMathPrecision(CUDNN_DATA_FLOAT)
-                               .build();
-    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());
-
-    // Define the logical_or descriptor
-    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
-                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
-                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
-                             .build();
-    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());
-
-    // Define the binary_selection descriptor
-    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
-                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
-                             .setMathPrecision(CUDNN_DATA_FLOAT)
-                             .build();
-    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());
+    // Define the activation backward operation
+    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
+                       .setMode(CUDNN_POINTWISE_RELU_BWD)
+                       .setMathPrecision(CUDNN_DATA_FLOAT)
+                       .build();
+    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
+
+    // Define the scale backward operation
+    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
+                         .setMode(CUDNN_POINTWISE_MUL)
+                         .setMathPrecision(CUDNN_DATA_FLOAT)
+                         .build();
+    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
 
     float alpha = 1.0f;
     float beta = 0.0f;
 
     // Create a convolution Node
-    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
-                       .setxDesc(std::get<X_TENSOR>(tensors))
-                       .setwDesc(std::get<W_TENSOR>(tensors))
-                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
+    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
+                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                        .setcDesc(convDesc)
                        .setAlpha(alpha)
                        .setBeta(beta)
                        .build();
     DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-    // Create a Add Node with scaling parameters.
-    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                        .setxDesc(conv_op.getOutputTensor())
-                        .setbDesc(std::get<Z_TENSOR>(tensors))
-                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
-                        .setpwDesc(scaleDesc)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
-
-    // Create a Bias Node.
-    auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                       .setxDesc(scale_op.getOutputTensor())
-                       .setbDesc(std::get<B_TENSOR>(tensors))
-                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
-                       .setpwDesc(biasDesc)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
-
-    // Create a optional add Node.
-    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(bias_op.getOutputTensor())
-                      .setbDesc(std::get<OPTIONAL>(tensors))
-                      .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
-                      .setpwDesc(addDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
-    // Create an Activation Node.
-    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
-                      .setyDesc(std::get<AFTERACT_TENSOR>(tensors))
-                      .setpwDesc(actDesc)
-                      .build();
+    // TODO: do we need getOutputTensor(), and what it returns in backward case?
+    // Create an relu backward Node.
+    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                      .setxDesc(std::get<RELU_TENSOR>(tensors))
+                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+                      .setpwDesc(actDesc)
+                      .build();
     DEBUG_CUDNN_MSG(log_buf, act_op.describe());
 
-    // Create a Gen_Index Node.
-    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                           .setxDesc(std::get<AFTERACT_TENSOR>(tensors))
-                           .setyDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                           .setpwDesc(genIndexDesc)
-                           .build();
-    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());
-
-    // Create a LessThan Node.
-    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                           .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                           .setbDesc(std::get<THRESHOLD_TOP_TENSOR>(tensors))
-                           .setyDesc(std::get<MASK_TOP_TENSOR>(tensors))
-                           .setpwDesc(lessThanDesc)
-                           .build();
-    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());
-
-    // Create a GreaterThan Node.
-    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                              .setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
-                              .setbDesc(std::get<THRESHOLD_BOTTOM_TENSOR>(tensors))
-                              .setyDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
-                              .setpwDesc(greaterThanDesc)
-                              .build();
-    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());
-
-    // Create a LogicalOr Node.
-    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(std::get<MASK_TOP_TENSOR>(tensors))
-                            .setbDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
-                            .setyDesc(std::get<MASK_TENSOR>(tensors))
-                            .setpwDesc(logicalOrDesc)
-                            .build();
-    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());
-
-    // Create a Binary_Selection Node.
-    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                            .setxDesc(std::get<AFTERCONV_TENSOR>(tensors))
-                            .setbDesc(std::get<AFTERACT_TENSOR>(tensors))
-                            .settDesc(std::get<MASK_TENSOR>(tensors))
-                            .setyDesc(std::get<Y_TENSOR>(tensors))
-                            .setpwDesc(selectionDesc)
-                            .build();
-    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());
+    // Create a Scale Node.
+    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
+                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                        .setpwDesc(scaleDesc)
+                        .build();
+    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
 
     // Create an Operation Graph. In this case it is convolution add bias activation
-    if (devPtrI) {
-      std::array<cudnn_frontend::Operation const*, 10> ops = {&conv_op, &scale_op, &bias_op, &add_op, &act_op,
-                                                              &genIndex_op, &lessThan_op, &greaterThan_op,
-                                                              &logicalOr_op, &selection_op};
-      auto opGraph = cudnn_frontend::OperationGraphBuilder()
-                         .setHandle(handle_)
-                         .setOperationGraph(ops.size(), ops.data())
-                         .build();
-      // Create string encoding for plan caching
-      auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
-      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
-      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
-      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
-      auto workspace_size = plan.getWorkspaceSize();
-      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
-      void* workspace_ptr = nullptr;
-      auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
-      if (workspace_size > 0) {
-        workspace_ptr = workspace_tensor.data_ptr<float>();
-      }
-      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI, devPtrT, devPtrU};
-      int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i', 't', 'u'};
-      auto variantPack = cudnn_frontend::VariantPackBuilder()
-                             .setWorkspacePointer(workspace_ptr)
-                             .setDataPointers(8, data_ptrs)
-                             .setUids(8, uids)
-                             .build();
-      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
-      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
-      checkCudnnErr(status);
-      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
-    }
-    else {
-      std::array<cudnn_frontend::Operation const*, 9> ops = {&conv_op, &scale_op, &bias_op, &act_op,
-                                                             &genIndex_op, &lessThan_op, &greaterThan_op,
-                                                             &logicalOr_op, &selection_op};
-      auto opGraph = cudnn_frontend::OperationGraphBuilder()
-                         .setHandle(handle_)
-                         .setOperationGraph(ops.size(), ops.data())
-                         .build();
-      // Create string encoding for plan caching
-      auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
-      DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
-      auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
-      DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
-      auto workspace_size = plan.getWorkspaceSize();
-      DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
-      void* workspace_ptr = nullptr;
-      auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
-      if (workspace_size > 0) {
-        workspace_ptr = workspace_tensor.data_ptr<float>();
-      }
-      void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrT, devPtrU};
-      int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 't', 'u'};
-      auto variantPack = cudnn_frontend::VariantPackBuilder()
-                             .setWorkspacePointer(workspace_ptr)
-                             .setDataPointers(7, data_ptrs)
-                             .setUids(7, uids)
-                             .build();
-      DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
-      cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
-      checkCudnnErr(status);
-      cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
-    }
+    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};
+
+    auto opGraph = cudnn_frontend::OperationGraphBuilder()
+                       .setHandle(handle_)
+                       .setOperationGraph(ops.size(), ops.data())
+                       .build();
+
+    // Create string encoding for plan caching
+    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+    auto workspace_size = plan.getWorkspaceSize();
+    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+    void* workspace_ptr = nullptr;
+    auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+    if (workspace_size > 0) {
+      workspace_ptr = workspace_tensor.data_ptr<float>();
+    }
+    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
+    int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
+    auto variantPack = cudnn_frontend::VariantPackBuilder()
+                           .setWorkspacePointer(workspace_ptr)
+                           .setDataPointers(5, data_ptrs)
+                           .setUids(5, uids)
+                           .build();
+    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+    checkCudnnErr(status);
+    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
   } catch (cudnn_frontend::cudnnException e) {
     std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
   }
 }
 
 void
-run_conv_scale_bias(int64_t* x_dim_padded,
-                    int64_t* pad,
-                    int64_t* convstride,
-                    int64_t* dilation,
-                    int64_t* w_dim_padded,
-                    int64_t* y_dim_padded,
-                    cudnnDataType_t dataType,
-                    at::Half* devPtrX,
-                    at::Half* devPtrW,
-                    at::Half* devPtrY,
-                    at::Half* devPtrZ,
-                    at::Half* devPtrB)
-{
+run_dconv(int64_t* x_dim_padded,
+          int64_t* pad,
+          int64_t* convstride,
+          int64_t* dilation,
+          int64_t* w_dim_padded,
+          int64_t* y_dim_padded,
+          cudnnDataType_t dataType,
+          at::Half* devPtrX,
+          at::Half* devPtrW,
+          at::Half* devPtrY,
+          cudnnBackendDescriptorType_t mode) {
   cudnnHandle_t handle_ = torch::native::getCudnnHandle();
   std::stringstream log_buf;
   try {
     int convDim = 2;
 
     // Creates the necessary tensor descriptors
-    common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
+    dconv_descriptors tensors = create_dconv_descriptors(
         x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
-    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
-
-    // Define the add operation
-    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                         .setMode(CUDNN_POINTWISE_MUL)
-                         .setMathPrecision(CUDNN_DATA_FLOAT)
-                         .build();
-    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
-
-    // Define the bias operation
-    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_ADD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
 
     // Define the convolution problem
     auto convDesc = cudnn_frontend::ConvDescBuilder()
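Note: the new run_dconv_drelu_dscale fuses three backward steps: a backward-data convolution, ReLU backward against the saved forward input, and a per-channel scale. A scalar restatement of the fused epilogue (illustration only; the real computation is one fused cuDNN graph over NHWC tensors):

    // Scalar view of the dgrad + drelu + dscale epilogue: the dconv output is
    // zeroed wherever the forward ReLU input was non-positive, then scaled.
    float dgrad_drelu_dscale(float dconv_out, float relu_input, float scale) {
        float drelu = relu_input > 0.0f ? dconv_out : 0.0f; // ReLU backward
        return drelu * scale;                               // per-channel scale
    }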
@@ -1508,36 +927,31 @@ run_conv_scale_bias(int64_t* x_dim_padded,
     float beta = 0.0f;
 
     // Create a convolution Node
-    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
-                       .setxDesc(std::get<X_TENSOR>(tensors))
-                       .setwDesc(std::get<W_TENSOR>(tensors))
-                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
-                       .setcDesc(convDesc)
-                       .setAlpha(alpha)
-                       .setBeta(beta)
-                       .build();
+    // mode should be one of following
+    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
+    // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
+    auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
+    if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
+      conv_op_builder.setdxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                     .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                     .setdyDesc(std::get<DY_TENSOR>(tensors))
+                     .setcDesc(convDesc)
+                     .setAlpha(alpha)
+                     .setBeta(beta);
+    }
+    else {
+      conv_op_builder.setxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                     .setdwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+                     .setdyDesc(std::get<DY_TENSOR>(tensors))
+                     .setcDesc(convDesc)
+                     .setAlpha(alpha)
+                     .setBeta(beta);
+    }
+    auto conv_op = conv_op_builder.build();
     DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
-    // Create a Add Node with scaling parameters.
-    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                        .setxDesc(conv_op.getOutputTensor())
-                        .setbDesc(std::get<Z_TENSOR>(tensors))
-                        .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
-                        // TODO: change enum to aftermul
-                        .setpwDesc(scaleDesc)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
-
-    // Create a Bias Node.
-    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(scale_op.getOutputTensor())
-                      .setbDesc(std::get<B_TENSOR>(tensors))
-                      .setyDesc(std::get<Y_TENSOR>(tensors))
-                      .setpwDesc(addDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
     // Create an Operation Graph. In this case it is convolution add bias activation
-    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};
+    std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};
 
     auto opGraph = cudnn_frontend::OperationGraphBuilder()
                        .setHandle(handle_)
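Note: with the mode parameter, run_dconv now serves both the data-gradient and weight-gradient convolutions. Hypothetical call sites (all dims and device pointers are placeholders, not names from this file):

    // The same run_dconv computes dx (backward data) or dw (backward filter)
    // depending on the backend descriptor type passed as `mode`.
    void example_dgrad_wgrad(int64_t* dimA, int64_t* padA, int64_t* strideA,
                             int64_t* dilationA, int64_t* filterdimA, int64_t* outdimA,
                             at::Half* x, at::Half* dx, at::Half* w, at::Half* dw,
                             at::Half* dy) {
        run_dconv(dimA, padA, strideA, dilationA, filterdimA, outdimA, CUDNN_DATA_HALF,
                  dx, w, dy, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
        run_dconv(dimA, padA, strideA, dilationA, filterdimA, outdimA, CUDNN_DATA_HALF,
                  x, dw, dy, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    }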
@@ -1559,13 +973,13 @@ run_conv_scale_bias(int64_t* x_dim_padded,
     if (workspace_size > 0) {
       workspace_ptr = workspace_tensor.data_ptr<float>();
     }
-    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
-    int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
+    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
+    int64_t uids[] = {'x', 'y', 'w'};
     auto variantPack = cudnn_frontend::VariantPackBuilder()
                            .setWorkspacePointer(workspace_ptr)
-                           .setDataPointers(5, data_ptrs)
-                           .setUids(5, uids)
+                           .setDataPointers(3, data_ptrs)
+                           .setUids(3, uids)
                            .build();
     DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
     cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
     checkCudnnErr(status);
@@ -1575,20 +989,18 @@ run_conv_scale_bias(int64_t* x_dim_padded,
 }
 
 void
-run_dconv_drelu_dscale(int64_t* x_dim_padded,
-                       int64_t* pad,
-                       int64_t* convstride,
-                       int64_t* dilation,
-                       int64_t* w_dim_padded,
-                       int64_t* y_dim_padded,
-                       cudnnDataType_t dataType,
-                       at::Half* devPtrX,
-                       at::Half* devPtrW,
-                       at::Half* devPtrY,
-                       at::Half* devPtrZ,
-                       at::Half* devPtrR)
-{
+run_dconv_add(int64_t* x_dim_padded,
+              int64_t* pad,
+              int64_t* convstride,
+              int64_t* dilation,
+              int64_t* w_dim_padded,
+              int64_t* y_dim_padded,
+              cudnnDataType_t dataType,
+              at::Half* devPtrX,
+              at::Half* devPtrW,
+              at::Half* devPtrY,
+              at::Half* devPtrR) {
   cudnnHandle_t handle_ = torch::native::getCudnnHandle();
   std::stringstream log_buf;
   try {
@@ -1617,19 +1029,12 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
                         .build();
     DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
 
-    // Define the activation backward operation
-    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_RELU_BWD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
-    // Define the scale backward operation
-    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                         .setMode(CUDNN_POINTWISE_MUL)
-                         .setMathPrecision(CUDNN_DATA_FLOAT)
-                         .build();
-    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
+    // Define the add backward operation
+    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
+                       .setMode(CUDNN_POINTWISE_ADD)
+                       .setMathPrecision(CUDNN_DATA_FLOAT)
+                       .build();
+    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
 
     float alpha = 1.0f;
     float beta = 0.0f;
@@ -1646,26 +1051,17 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
     DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
 
     // TODO: do we need getOutputTensor(), and what it returns in backward case?
-    // Create an relu backward Node.
-    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                      .setxDesc(std::get<RELU_TENSOR>(tensors))
-                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                      .setpwDesc(actDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
-
-    // Create a Scale Node.
-    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
-                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
-                        .setpwDesc(scaleDesc)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+    // Create add Node.
+    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+                      .setbDesc(std::get<RELU_TENSOR>(tensors))
+                      .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+                      .setpwDesc(addDesc)
+                      .build();
+    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
 
     // Create an Operation Graph. In this case it is convolution add bias activation
-    std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};
+    std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &add_op};
 
     auto opGraph = cudnn_frontend::OperationGraphBuilder()
                        .setHandle(handle_)
@@ -1687,12 +1083,12 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
     if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
     }
-    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
-    int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
+    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
+    int64_t uids[] = {'x', 'y', 'w', 'r'};
     auto variantPack = cudnn_frontend::VariantPackBuilder()
                            .setWorkspacePointer(workspace_ptr)
-                           .setDataPointers(5, data_ptrs)
-                           .setUids(5, uids)
+                           .setDataPointers(4, data_ptrs)
+                           .setUids(4, uids)
                            .build();
     DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
     cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
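Note: the bottleneck_forward function added in the hunk below derives each stage's output spatial extent with getFwdConvOutputDim. Its definition is not part of this diff; the usual cudnn_frontend sample definition, assumed here as a sketch, is:

    // Assumed helper: output = 1 + (input + 2*pad - effective_filter) / stride,
    // where the effective filter extent accounts for dilation.
    int64_t getFwdConvOutputDim(int64_t tensorDim, int64_t pad, int64_t filterDim,
                                int64_t stride, int64_t dilation) {
        int64_t effectiveFilter = (filterDim - 1) * dilation + 1;
        return 1 + (tensorDim + 2 * pad - effectiveFilter) / stride;
    }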
@@ -1703,496 +1099,1467 @@ run_dconv_drelu_dscale(int64_t* x_dim_padded,
   }
 }
 
-void
-run_dconv_add_drelu_dscale(int64_t* x_dim_padded,
-                           int64_t* pad,
-                           int64_t* convstride,
-                           int64_t* dilation,
-                           int64_t* w_dim_padded,
-                           int64_t* y_dim_padded,
-                           cudnnDataType_t dataType,
-                           at::Half* devPtrX,
-                           at::Half* devPtrW,
-                           at::Half* devPtrY,
-                           at::Half* devPtrZ,
-                           at::Half* devPtrR,
-                           at::Half* devPtrI)
-{
-  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
-  std::stringstream log_buf;
-  try {
-    int convDim = 2;
-
-    // Creates the necessary tensor descriptors
-    dconv_descriptors tensors = create_dconv_descriptors(
-        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
-    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_INPUT_TENSOR>(tensors).describe());
-    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
-
-    // Define the convolution problem
-    auto convDesc = cudnn_frontend::ConvDescBuilder()
-                        .setDataType(CUDNN_DATA_FLOAT)
-                        .setMathMode(CUDNN_CROSS_CORRELATION)
-                        .setNDims(convDim)
-                        .setStrides(convDim, convstride)
-                        .setPrePadding(convDim, pad)
-                        .setPostPadding(convDim, pad)
-                        .setDilation(convDim, dilation)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
-
-    // optional add
-    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_ADD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
-
-    // Define the activation backward operation
-    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
-                       .setMode(CUDNN_POINTWISE_RELU_BWD)
-                       .setMathPrecision(CUDNN_DATA_FLOAT)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
-
-    // Define the scale backward operation
-    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
-                         .setMode(CUDNN_POINTWISE_MUL)
-                         .setMathPrecision(CUDNN_DATA_FLOAT)
-                         .build();
-    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
-
-    float alpha = 1.0f;
-    float beta = 0.0f;
-
-    // Create a convolution Node
-    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
-                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
-                       .setdyDesc(std::get<DY_TENSOR>(tensors))
-                       .setcDesc(convDesc)
-                       .setAlpha(alpha)
-                       .setBeta(beta)
-                       .build();
-    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
-
-    // Create add Node.
-    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
-                      .setbDesc(std::get<DGRAD_INPUT_TENSOR>(tensors))
-                      .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
-                      .setpwDesc(addDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
-
-    // TODO: do we need getOutputTensor(), and what it returns in backward case?
-    // Create an relu backward Node.
-    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                      .setdyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
-                      .setxDesc(std::get<RELU_TENSOR>(tensors))
-                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                      .setpwDesc(actDesc)
-                      .build();
-    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
-
-    // Create a Scale Node.
-    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
-                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
-                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
-                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
-                        .setpwDesc(scaleDesc)
-                        .build();
-    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
-
-    // Create an Operation Graph. In this case it is convolution add bias activation
-    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &add_op, &act_op, &scale_op};
-
-    auto opGraph = cudnn_frontend::OperationGraphBuilder()
-                       .setHandle(handle_)
-                       .setOperationGraph(ops.size(), ops.data())
-                       .build();
-
-    // Create string encoding for plan caching
-    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
-    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
-
-    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
-    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
-
-    auto workspace_size = plan.getWorkspaceSize();
-    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
-
-    void* workspace_ptr = nullptr;
-    auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
-    if (workspace_size > 0) {
-      workspace_ptr = workspace_tensor.data_ptr<float>();
-    }
-    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ,
+// inputs contains x,w,z,b,(i)
+std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+
+  std::cout << std::fixed;
+
+  // create output vector
+  std::vector<at::Tensor> outputs;
+  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+  // setup dimensions
+  int64_t dimA[] = {0, 0, 0, 0};
+  int64_t filterdimA1[] = {0, 0, 0, 0};
+  int64_t filterdimA2[] = {0, 0, 0, 0};
+  int64_t filterdimA3[] = {0, 0, 0, 0};
+  int64_t filterdimA4[] = {0, 0, 0, 0};
+
+  // All dim calculation after this order of n,c,h,w
+  int axis[] {0,1,2,3};
+  if (explicit_nhwc) {
+    axis[0] = 0;
+    axis[1] = 3;
+    axis[2] = 1;
+    axis[3] = 2;
+  }
+  for (int dim=0;dim<4;dim++) {
+    dimA[dim] = inputs[0].size(axis[dim]);
+    filterdimA1[dim] = inputs[1].size(axis[dim]);
+    filterdimA2[dim] = inputs[2].size(axis[dim]);
+    filterdimA3[dim] = inputs[3].size(axis[dim]);
+  }
+  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+    for (int dim=0;dim<4;dim++) {
+      filterdimA4[dim] = inputs[10].size(axis[dim]);
+    }
+  }
+
+  // output dim in n,c,h,w used by backend
+  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
+  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
+  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
+
+  // use these fixed value for test run
+  int64_t padA[] = {0, 0};
+  int64_t padA1[] = {1, 1};
+  int64_t dilationA[] = {1, 1};
+  int64_t convstrideA[] = {1, 1};
+  int64_t convstride1X1[] = {stride_1X1, stride_1X1};
+
+  // compute output from pad/stride/dilation
+  outdimA1[0] = dimA[0];
+  outdimA1[1] = filterdimA1[0];
+  for (int dim = 0; dim < 2; dim++) {
+    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+  }
+
+  outdimA2[0] = outdimA1[0];
+  outdimA2[1] = filterdimA2[0];
+  for (int dim = 0; dim < 2; dim++) {
+    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+  }
+
+  outdimA3[0] = outdimA2[0];
+  outdimA3[1] = filterdimA3[0];
+  for (int dim = 0; dim < 2; dim++) {
+    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+  }
+
+  // Create output tensor in the correct shape in pytorch's view
+  int64_t outdim1[] = {0, 0, 0, 0};
+  int64_t outdim2[] = {0, 0, 0, 0};
+  int64_t outdim3[] = {0, 0, 0, 0};
+  if (explicit_nhwc) {
+    axis[0] = 0;
+    axis[1] = 2;
+    axis[2] = 3;
+    axis[3] = 1;
+  }
+  for (int dim=0;dim<4;dim++) {
+    outdim1[dim] = outdimA1[axis[dim]];
+    outdim2[dim] = outdimA2[axis[dim]];
+    outdim3[dim] = outdimA3[axis[dim]];
+  }
+
+  // run
+  at::Half* x = inputs[0].data_ptr<at::Half>();
+  at::Half* w = inputs[1].data_ptr<at::Half>();
+  at::Half* z = inputs[4].data_ptr<at::Half>();
+  at::Half* b = inputs[7].data_ptr<at::Half>();
+  auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
+  at::Half* y1 = out1.data_ptr<at::Half>();
+
+  run_conv_scale_bias_add_activation(dimA,
+                                     padA,
+                                     convstride1X1,
+                                     dilationA,
+                                     filterdimA1,
+                                     outdimA1,
+                                     CUDNN_DATA_HALF,
+                                     x,
+                                     w,
+                                     y1,
+                                     z,
+                                     b,
+                                     nullptr);
+  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());
+
+  w = inputs[2].data_ptr<at::Half>();
+  z = inputs[5].data_ptr<at::Half>();
+  b = inputs[8].data_ptr<at::Half>();
+  auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
+  at::Half* y2 = out2.data_ptr<at::Half>();
+
+  run_conv_scale_bias_add_activation(outdimA1,
+                                     padA1,
+                                     convstrideA,
+                                     dilationA,
+                                     filterdimA2,
+                                     outdimA2,
+                                     CUDNN_DATA_HALF,
+                                     y1,
+                                     w,
+                                     y2,
+                                     z,
+                                     b,
+                                     nullptr);
+  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());
+
+  // create output of conv3
+  auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
+  at::Half* y3 = out3.data_ptr<at::Half>();
,
devPtrR
,
devPtrI
};
int64_t
uids
[]
=
{
'x'
,
'y'
,
'w'
,
's'
,
'r'
,
'i'
};
auto
variantPack
=
cudnn_frontend
::
VariantPackBuilder
()
.
setWorkspacePointer
(
workspace_ptr
)
.
setDataPointers
(
6
,
data_ptrs
)
.
setUids
(
6
,
uids
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
"variantPack "
<<
variantPack
.
describe
());
cudnnStatus_t
status
=
cudnnBackendExecute
(
handle_
,
plan
.
get_raw_desc
(),
variantPack
.
get_raw_desc
());
checkCudnnErr
(
status
);
cudnn_frontend
::
throw_if
([
status
]()
{
return
(
status
!=
CUDNN_STATUS_SUCCESS
);
},
"Plan execute error"
,
status
);
}
catch
(
cudnn_frontend
::
cudnnException
e
)
{
std
::
cout
<<
log_buf
.
str
()
<<
"[ERROR] Exception "
<<
e
.
what
()
<<
std
::
endl
;
}
}
void
// create output of conv4 that may exist
run_dconv_drelu_dscale_mask
(
int64_t
*
x_dim_padded
,
auto identity = at::empty_like(out3);
int64_t
*
pad
,
at::Half* yi = identity.data_ptr<at::Half>();
int64_t
*
convstride
,
int64_t
*
dilation
,
int64_t
*
w_dim_padded
,
int64_t
*
y_dim_padded
,
int64_t
*
threshold_dim
,
cudnnDataType_t
dataType
,
at
::
Half
*
devPtrX
,
at
::
Half
*
devPtrW
,
at
::
Half
*
devPtrY
,
at
::
Half
*
devPtrZ
,
at
::
Half
*
devPtrR
,
int
*
devPtrT
,
int
*
devPtrU
,
int
axis
)
{
cudnnHandle_t
handle_
=
torch
::
native
::
getCudnnHandle
();
std
::
stringstream
log_buf
;
try
{
int
convDim
=
2
;
// Creates the necessary tensor descriptors
if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
dconv_mask_descriptors
tensors
=
create_dconv_mask_descriptors
(
x_dim_padded
,
pad
,
convstride
,
dilation
,
w_dim_padded
,
y_dim_padded
,
threshold_dim
,
dataType
);
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
X_OR_DX_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DY_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
W_OR_DW_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
SCALE_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
RELU_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTER_DCONV_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
AFTER_DRELU_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_OPTIONAL_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_GEN_INDEX_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_MASK_TOP_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_MASK_BOTTOM_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_MASK_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_THRESHOLD_TOP_TENSOR
>
(
tensors
).
describe
());
DEBUG_CUDNN_MSG
(
log_buf
,
std
::
get
<
DGRAD_THRESHOLD_BOTTOM_TENSOR
>
(
tensors
).
describe
());
// Define the convolution problem
w = inputs[10].data_ptr<at::Half>();
auto
convDesc
=
cudnn_frontend
::
ConvDescBuilder
()
z = inputs[11].data_ptr<at::Half>();
.
setDataType
(
CUDNN_DATA_FLOAT
)
b = inputs[12].data_ptr<at::Half>();
.
setMathMode
(
CUDNN_CROSS_CORRELATION
)
run_conv_scale_bias(dimA,
.
setNDims
(
convDim
)
padA,
.
setStrides
(
convDim
,
convstride
)
convstride1X1,
.
setPrePadding
(
convDim
,
pad
)
dilationA,
.
setPostPadding
(
convDim
,
pad
)
filterdimA4,
.
setDilation
(
convDim
,
dilation
)
outdimA3,
.
build
();
CUDNN_DATA_HALF,
DEBUG_CUDNN_MSG
(
log_buf
,
convDesc
.
describe
());
x,
w,
yi,
z,
b);
DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
}
else {
yi = x;
}
// Define the activation backward operation
w = inputs[3].data_ptr<at::Half>();
auto
actDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
z = inputs[6].data_ptr<at::Half>();
.
setMode
(
CUDNN_POINTWISE_RELU_BWD
)
b = inputs[9].data_ptr<at::Half>();
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
actDesc
.
describe
());
// Define the scale backward operation
run_conv_scale_bias_add_activation(outdimA2,
auto
scaleDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
padA,
.
setMode
(
CUDNN_POINTWISE_MUL
)
convstrideA,
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
dilationA,
.
build
();
filterdimA3,
DEBUG_CUDNN_MSG
(
log_buf
,
scaleDesc
.
describe
());
outdimA3,
CUDNN_DATA_HALF,
y2,
w,
y3,
z,
b,
yi);
DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());
// Define the genIndex descriptor
outputs.push_back(out1);
auto
genIndexDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
outputs.push_back(out2);
.
setMode
(
CUDNN_POINTWISE_GEN_INDEX
)
outputs.push_back(out3);
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
setAxis
(
axis
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
genIndexDesc
.
describe
());
// Define the lessThan descriptor
return outputs;
auto
lessThanDesc
=
cudnn_frontend
::
PointWiseDescBuilder
()
}
.
setMode
(
CUDNN_POINTWISE_CMP_LT
)
.
setMathPrecision
(
CUDNN_DATA_FLOAT
)
.
build
();
DEBUG_CUDNN_MSG
(
log_buf
,
lessThanDesc
.
describe
());
// Define the greaterThan descriptor
std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
    bool requires_grad = inputs[0].requires_grad();
    std::cout << std::fixed;
    // create output vector
    std::vector<at::Tensor> outputs;
    auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
    // setup dimensions
    int64_t dimA[] = {0, 0, 0, 0};
    int64_t filterdimA1[] = {0, 0, 0, 0};
    int64_t filterdimA2[] = {0, 0, 0, 0};
    int64_t filterdimA3[] = {0, 0, 0, 0};
    int64_t filterdimA4[] = {0, 0, 0, 0};
    // All dim calculation after this order of n,c,h,w
    int axis[] {0,1,2,3};
    if (explicit_nhwc) {
        axis[0] = 0;
        axis[1] = 3;
        axis[2] = 1;
        axis[3] = 2;
    }
    for (int dim=0;dim<4;dim++) {
        dimA[dim] = inputs[0].size(axis[dim]);
        filterdimA1[dim] = inputs[1].size(axis[dim]);
        filterdimA2[dim] = inputs[2].size(axis[dim]);
        filterdimA3[dim] = inputs[3].size(axis[dim]);
    }
    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
        for (int dim=0;dim<4;dim++) {
            filterdimA4[dim] = inputs[14].size(axis[dim]);
        }
    }
    // output dim in n,c,h,w used by backend
    int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
    int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
    int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
    // use these fixed value for test run
    int64_t padA[] = {0, 0};
    int64_t padA1[] = {1, 1};
    int64_t dilationA[] = {1, 1};
    int64_t convstrideA[] = {1, 1};
    int64_t convstride1X1[] = {stride_1X1, stride_1X1};
    // compute output from pad/stride/dilation
    outdimA1[0] = dimA[0];
    outdimA1[1] = filterdimA1[0];
    for (int dim = 0; dim < 2; dim++) {
        outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
    }
    outdimA2[0] = outdimA1[0];
    outdimA2[1] = filterdimA2[0];
    for (int dim = 0; dim < 2; dim++) {
        outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
    }
    outdimA3[0] = outdimA2[0];
    outdimA3[1] = filterdimA3[0];
    for (int dim = 0; dim < 2; dim++) {
        outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
    }
    // Create output tensor in the correct shape in pytorch's view
    int64_t outdim1[] = {0, 0, 0, 0};
    int64_t outdim2[] = {0, 0, 0, 0};
    int64_t outdim3[] = {0, 0, 0, 0};
    if (explicit_nhwc) {
        axis[0] = 0;
        axis[1] = 2;
        axis[2] = 3;
        axis[3] = 1;
    }
    for (int dim=0;dim<4;dim++) {
        outdim1[dim] = outdimA1[axis[dim]];
        outdim2[dim] = outdimA2[axis[dim]];
        outdim3[dim] = outdimA3[axis[dim]];
    }
    // dconv3+drelu2+dscale2
    at::Half* conv_in = inputs[13].data_ptr<at::Half>();
    at::Half* dy3 = inputs[10].data_ptr<at::Half>();
    DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());
    // wgrad
    auto wgrad3 = at::empty_like(inputs[3]);
    at::Half* dw3 = wgrad3.data_ptr<at::Half>();
    run_dconv(outdimA2,
              padA,
              convstrideA,
              dilationA,
              filterdimA3,
              outdimA3,
              CUDNN_DATA_HALF,
              conv_in,
              dw3,
              dy3,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    // dgrad
    auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
    at::Half* dy2 = grad_out2.data_ptr<at::Half>();
    at::Half* w = inputs[3].data_ptr<at::Half>();
    at::Half* z = inputs[5].data_ptr<at::Half>();
    at::Half* relu2 = inputs[13].data_ptr<at::Half>();
    run_dconv_drelu_dscale(outdimA2,
                           padA,
                           convstrideA,
                           dilationA,
                           filterdimA3,
                           outdimA3,
                           CUDNN_DATA_HALF,
                           dy2,
                           w,
                           dy3,
                           z,
                           relu2);
    DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());
    // dconv2+drelu1+dscale1
    conv_in = inputs[12].data_ptr<at::Half>();
    // wgrad
    auto wgrad2 = at::empty_like(inputs[2]);
    at::Half* dw2 = wgrad2.data_ptr<at::Half>();
    run_dconv(outdimA1,
              padA1,
              convstrideA,
              dilationA,
              filterdimA2,
              outdimA2,
              CUDNN_DATA_HALF,
              conv_in,
              dw2,
              dy2,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    // dgrad
    auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
    at::Half* dy1 = grad_out1.data_ptr<at::Half>();
    w = inputs[2].data_ptr<at::Half>();
    z = inputs[4].data_ptr<at::Half>();
    at::Half* relu1 = inputs[12].data_ptr<at::Half>();
    // fused dgrad
    run_dconv_drelu_dscale(outdimA1,
                           padA1,
                           convstrideA,
                           dilationA,
                           filterdimA2,
                           outdimA2,
                           CUDNN_DATA_HALF,
                           dy1,
                           w,
                           dy2,
                           z,
                           relu1);
    /*
    // backward strided conv cannot be fused
    // if stride == 1 but channel changes, we can fuse here
    if (stride_1X1 != 1){
        // dgrad
        run_dconv(outdimA1,
                  padA1,
                  convstride1X1,
                  dilationA,
                  filterdimA2,
                  outdimA2,
                  CUDNN_DATA_HALF,
                  dy1,
                  w,
                  dy2,
                  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
        // mul fused mask
        grad_out1.mul_(inputs[15]);
    }
    else {
        at::Half* relu1 = inputs[12].data_ptr<at::Half>();
        // fused dgrad
        run_dconv_drelu_dscale(outdimA1,
                               padA1,
                               convstride1X1,
                               dilationA,
                               filterdimA2,
                               outdimA2,
                               CUDNN_DATA_HALF,
                               dy1,
                               w,
                               dy2,
                               z,
                               relu1);
    }
    */
    DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());
    // create grads of conv4 that may exist
    auto grad_x_conv4 = at::empty_like(inputs[0]);
    at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
    at::Tensor wgrad4;
    // x used for dconv1 and dconv4 wgrad
    at::Half* x = inputs[0].data_ptr<at::Half>();
    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
        w = inputs[14].data_ptr<at::Half>();
        at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
        if (requires_grad) {
            run_dconv(dimA,
                      padA,
                      convstride1X1,
                      dilationA,
                      filterdimA4,
                      outdimA3,
                      CUDNN_DATA_HALF,
                      dx_conv4,
                      w,
                      dy_conv4,
                      CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
            // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
            // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
        }
        // wgrad
        wgrad4 = at::empty_like(inputs[14]);
        at::Half* dw4 = wgrad4.data_ptr<at::Half>();
        run_dconv(dimA,
                  padA,
                  convstride1X1,
                  dilationA,
                  filterdimA4,
                  outdimA3,
                  CUDNN_DATA_HALF,
                  x,
                  dw4,
                  dy_conv4,
                  CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    }
    else {
        // if there is no downsample, dx_conv4 is fork of drelu3
        dx_conv4 = inputs[11].data_ptr<at::Half>();
    }
    // dconv1+add
    // wgrad
    auto wgrad1 = at::empty_like(inputs[1]);
    at::Half* dw1 = wgrad1.data_ptr<at::Half>();
    run_dconv(dimA,
              padA,
              convstride1X1,
              dilationA,
              filterdimA1,
              outdimA1,
              CUDNN_DATA_HALF,
              x,
              dw1,
              dy1,
              CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
    // dgrad
    w = inputs[1].data_ptr<at::Half>();
    auto grad_x = at::empty_like(inputs[0]);
    at::Half* dx = grad_x.data_ptr<at::Half>();
    // backward strided conv cannot be fused
    // if stride == 1 but channel changes, we can fuse here
    if (requires_grad){
        if (stride_1X1 != 1){
            run_dconv(dimA,
                      padA,
                      convstride1X1,
                      dilationA,
                      filterdimA1,
                      outdimA1,
                      CUDNN_DATA_HALF,
                      dx,
                      w,
                      dy1,
                      CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
            // add 2 together
            grad_x.add_(grad_x_conv4);
        }
        else {
            run_dconv_add(dimA,
                          padA,
                          convstride1X1,
                          dilationA,
                          filterdimA1,
                          outdimA1,
                          CUDNN_DATA_HALF,
                          dx,
                          w,
                          dy1,
                          dx_conv4);
        }
    }
    DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
    DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());
    DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
    DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
    outputs.push_back(grad_x);
    outputs.push_back(wgrad1);
    outputs.push_back(wgrad2);
    outputs.push_back(wgrad3);
    if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
        DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
        outputs.push_back(wgrad4);
    }
    return outputs;
}
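
// Usage sketch, not part of the original file: drives bottleneck_forward with
// self-consistent NHWC shapes. The sizes (N=1, H=W=56, C_in=64, bottleneck
// width 64, expansion to 256), the kHalf/CUDA options, and the guard macro
// BOTTLENECK_SMOKE_TEST are illustrative assumptions; the input ordering
// mirrors the indices read by the functions above (inputs[0]=x, [1..3]=conv
// weights, [4..6]=scales, [7..9]=biases, [10..12]=downsample weight/scale/bias).
#ifdef BOTTLENECK_SMOKE_TEST
static void bottleneck_smoke_test() {
    auto opts = at::TensorOptions(at::kCUDA).dtype(at::kHalf);
    std::vector<at::Tensor> fwd_in;
    fwd_in.push_back(at::randn({1, 56, 56, 64}, opts));   // [0] x, NHWC
    fwd_in.push_back(at::randn({64, 1, 1, 64}, opts));    // [1] w1, 1x1 reduce
    fwd_in.push_back(at::randn({64, 3, 3, 64}, opts));    // [2] w2, 3x3
    fwd_in.push_back(at::randn({256, 1, 1, 64}, opts));   // [3] w3, 1x1 expand
    for (int c : {64, 64, 256}) fwd_in.push_back(at::ones({1, 1, 1, c}, opts));   // [4..6] scales
    for (int c : {64, 64, 256}) fwd_in.push_back(at::zeros({1, 1, 1, c}, opts));  // [7..9] biases
    // channel count changes (64 -> 256), so the downsample branch is taken:
    fwd_in.push_back(at::randn({256, 1, 1, 64}, opts));   // [10] w4
    fwd_in.push_back(at::ones({1, 1, 1, 256}, opts));     // [11] z4
    fwd_in.push_back(at::zeros({1, 1, 1, 256}, opts));    // [12] b4
    auto outs = bottleneck_forward(/*explicit_nhwc=*/true, /*stride_1X1=*/1, fwd_in);
    // outs holds {relu1, relu2, relu3}; bottleneck_backward additionally expects
    // the incoming gradient and the saved activations at inputs[10..14] (see the
    // reads in bottleneck_backward above).
    (void)outs;
}
#endif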
namespace {
enum {
X_TENSOR,
Y_TENSOR,
W_TENSOR,
Z_TENSOR,
B_TENSOR,
AFTERADD_TENSOR,
AFTERBIAS_TENSOR,
AFTERCONV_TENSOR,
OPTIONAL,
AFTEROPT_TENSOR,
AFTERACT_TENSOR,
GEN_INDEX_TENSOR,
MASK_TOP_TENSOR,
MASK_BOTTOM_TENSOR,
MASK_TENSOR,
THRESHOLD_TOP_TENSOR,
THRESHOLD_BOTTOM_TENSOR,
};
using masked_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor>;
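// Sketch of a compile-time guard, not in the original file: the std::get<...>
// calls below index this tuple with the enum above, so the two must stay in sync.
static_assert(std::tuple_size<masked_convbias_descriptors>::value == THRESHOLD_BOTTOM_TENSOR + 1,
              "masked_convbias_descriptors needs one cudnn_frontend::Tensor per enum entry");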
masked_convbias_descriptors
create_conv_bias_add_act_mask_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
int64_t* threshold_dim,
cudnnDataType_t dataType) {
const int convDim = 2;
int64_t b_dim_padded[4];
b_dim_padded[0] = 1;
b_dim_padded[1] = y_dim_padded[1];
b_dim_padded[2] = 1;
b_dim_padded[3] = 1;
int64_t x_stride_padded[4];
int64_t y_stride_padded[4];
int64_t w_stride_padded[4];
int64_t b_stride_padded[4];
int64_t threshold_stride[4];
generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
return masked_convbias_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, w_stride_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('z')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('b')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setVirtual()
.setId('A') // after add
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setVirtual()
.setId('B') // after bias
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('C') // after conv
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('i')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('D') // after optional add
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('E') // after act for masked
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('I') // output of the gen index operation
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_INT32)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('m') // top half of the mask created after the less than
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('n') // bottom half of the mask
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('M') // OR of the top and bottom masks
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, threshold_dim)
.setStrides(4, threshold_stride)
.setId('t') // threshold for creating the top mask
.setAlignment(16)
.setDataType(CUDNN_DATA_INT32)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, threshold_dim)
.setStrides(4, threshold_stride)
.setId('u') // threshold for creating the bottom mask
.setAlignment(16)
.setDataType(CUDNN_DATA_INT32)
.build());
}
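// Sketch, not in the original file: the two INT32 threshold tensors built above
// are compared against a GEN_INDEX along one spatial axis, so a caller would
// materialize them as broadcastable scalars. The {1,1,1,1} shape and the
// "first/last row" values used here are assumptions for illustration only.
static std::pair<at::Tensor, at::Tensor> make_row_thresholds(int64_t H) {
    auto opts = at::TensorOptions(at::kCUDA).dtype(at::kInt);
    at::Tensor top = at::full({1, 1, 1, 1}, 1, opts);        // idx <  top    -> masked
    at::Tensor bottom = at::full({1, 1, 1, 1}, H - 2, opts); // idx >  bottom -> masked
    return {top, bottom};
}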
// tensor descriptors used for dgrad
enum {
X_OR_DX_TENSOR,
DY_TENSOR,
W_OR_DW_TENSOR,
SCALE_TENSOR,
RELU_TENSOR,
AFTER_DCONV_TENSOR,
AFTER_DRELU_TENSOR,
DGRAD_INPUT_TENSOR,
DGRAD_OPTIONAL_TENSOR,
DGRAD_GEN_INDEX_TENSOR,
DGRAD_MASK_TOP_TENSOR,
DGRAD_MASK_BOTTOM_TENSOR,
DGRAD_MASK_TENSOR,
DGRAD_THRESHOLD_TOP_TENSOR,
DGRAD_THRESHOLD_BOTTOM_TENSOR,
};
using dconv_add_descriptors = std::tuple<cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor>;
dconv_add_descriptors
create_dconv_add_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType) {
const int convDim = 2;
int64_t b_dim_padded[4];
b_dim_padded[0] = 1;
b_dim_padded[1] = x_dim_padded[1];
b_dim_padded[2] = 1;
b_dim_padded[3] = 1;
int64_t x_stride_padded[4];
int64_t y_stride_padded[4];
int64_t w_stride_padded[4];
int64_t b_stride_padded[4];
generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
return dconv_add_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, w_stride_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('s')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('r')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('A') // after dconv
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('B') // after drelu
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('i')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('D') // after optional add
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_FLOAT)
.build());
}
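// Relative to the plain dgrad descriptor set, the tuple above adds 'i'
// (DGRAD_INPUT) and the virtual 'D' (DGRAD_OPTIONAL): 'i' is a side input that
// is summed into the data gradient before the fused dReLU/dScale, i.e. the
// intent (sketch) is dx = scale * drelu(dconv(dy, w) + i, relu_in), with 'D'
// holding the intermediate sum.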
using dconv_mask_descriptors = std::tuple<cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor,
cudnn_frontend::Tensor>;
dconv_mask_descriptors
create_dconv_mask_descriptors(int64_t* x_dim_padded,
int64_t* padA,
int64_t* convstrideA,
int64_t* dilationA,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
int64_t* threshold_dim,
cudnnDataType_t dataType) {
const int convDim = 2;
int64_t b_dim_padded[4];
b_dim_padded[0] = 1;
b_dim_padded[1] = x_dim_padded[1];
b_dim_padded[2] = 1;
b_dim_padded[3] = 1;
int64_t x_stride_padded[4];
int64_t y_stride_padded[4];
int64_t w_stride_padded[4];
int64_t b_stride_padded[4];
int64_t threshold_stride[4];
generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
generateStrides(threshold_dim, threshold_stride, 4, CUDNN_TENSOR_NHWC);
return dconv_mask_descriptors(cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('x')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('y')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, w_dim_padded)
.setStrides(4, w_stride_padded)
.setId('w')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, b_dim_padded)
.setStrides(4, b_stride_padded)
.setId('s')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setId('r')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('A') // after dconv
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, x_dim_padded)
.setStrides(4, x_stride_padded)
.setVirtual()
.setId('B') // after drelu
.setAlignment(16)
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('i')
.setAlignment(16)
.setDataType(dataType)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('D') // after optional add
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_FLOAT)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('I') // output of the gen index operation
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_INT32)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('m') // top half of the mask created after the less than
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('n') // bottom half of the mask
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, y_dim_padded)
.setStrides(4, y_stride_padded)
.setId('M') // OR of the top and bottom masks
.setAlignment(16)
.setVirtual()
.setDataType(CUDNN_DATA_BOOLEAN)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, threshold_dim)
.setStrides(4, threshold_stride)
.setId('t') // threshold for creating the top mask
.setAlignment(16)
.setDataType(CUDNN_DATA_INT32)
.build(),
cudnn_frontend::TensorBuilder()
.setDim(4, threshold_dim)
.setStrides(4, threshold_stride)
.setId('u') // threshold for creating the bottom mask
.setAlignment(16)
.setDataType(CUDNN_DATA_INT32)
.build());
}
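// Same mask plumbing as the forward masked descriptors above, but for dgrad:
// 'I' is the generated index along the masked axis, 'm'/'n' the two boolean
// half-masks, 'M' their logical OR, and 't'/'u' the integer thresholds. Every
// mask tensor except the thresholds is virtual, so only 't' and 'u' show up in
// the variant pack of the consuming plan.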
void
run_conv_add_scale_bias_activation(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrB,
at::Half* devPtrI) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTEROPT_TENSOR>(tensors).describe());
// Define the scale operation
auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_MUL)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
// Define the bias operation
auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
// optional add
auto addDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
// Define the activation operation
auto actDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_FWD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
.setxDesc(std::get<X_TENSOR>(tensors))
.setwDesc(std::get<W_TENSOR>(tensors))
.setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// create an add node.
auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(conv_op.getOutputTensor())
.setbDesc(std::get<OPTIONAL>(tensors))
.setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
.setpwDesc(addDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, add_op.describe());
// Create a Scale Node.
auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(add_op.getOutputTensor())
.setbDesc(std::get<Z_TENSOR>(tensors))
.setyDesc(std::get<AFTERADD_TENSOR>(tensors))
.setpwDesc(scaleDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
// Create a Bias Node.
auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(scale_op.getOutputTensor())
.setbDesc(std::get<B_TENSOR>(tensors))
.setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
.setpwDesc(biasDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
// Create an Activation Node.
auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(bias_op.getOutputTensor())
.setyDesc(std::get<Y_TENSOR>(tensors))
.setpwDesc(actDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, act_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &add_op, &scale_op, &bias_op, &act_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr<float>();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(6, data_ptrs)
.setUids(6, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
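// Note the operand order in the function above: the optional residual is added
// to the raw convolution output *before* scale and bias, whereas the
// run_conv_scale_bias_add_activation family applies the residual add after
// scale/bias. Both variants finish with the same ReLU.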
void
run_conv_scale_bias_add_activation_mask(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
int64_t* threshold_dim,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrB,
at::Half* devPtrI,
int* devPtrT,
int* devPtrU,
int axis) {
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
std::stringstream log_buf;
try {
int convDim = 2;
// Creates the necessary tensor descriptors
masked_convbias_descriptors tensors = create_conv_bias_add_act_mask_descriptors(
x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<AFTERACT_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<GEN_INDEX_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TOP_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<MASK_BOTTOM_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<MASK_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_TOP_TENSOR>(tensors).describe());
DEBUG_CUDNN_MSG(log_buf, std::get<THRESHOLD_BOTTOM_TENSOR>(tensors).describe());
// Define the scale operation
auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_MUL)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
// Define the bias operation
auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
// optional add
auto addDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_ADD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
// Define the activation operation
auto actDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_FWD)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
// Define the convolution problem
auto convDesc = cudnn_frontend::ConvDescBuilder()
.setDataType(CUDNN_DATA_FLOAT)
.setMathMode(CUDNN_CROSS_CORRELATION)
.setNDims(convDim)
.setStrides(convDim, convstride)
.setPrePadding(convDim, pad)
.setPostPadding(convDim, pad)
.setDilation(convDim, dilation)
.build();
DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
// Define the genIndex descriptor
auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_GEN_INDEX)
.setMathPrecision(CUDNN_DATA_FLOAT)
.setAxis(axis)
.build();
DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());
// Define the lessThan descriptor
auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_CMP_LT)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());
// Define the greaterThan descriptor
auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_CMP_GT)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());
// Define the logical_or descriptor
auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_LOGICAL_OR)
.setMathPrecision(CUDNN_DATA_BOOLEAN)
.build();
DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());
// Define the binary_selection descriptor
auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_BINARY_SELECT)
.setMathPrecision(CUDNN_DATA_FLOAT)
.build();
DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());
float alpha = 1.0f;
float beta = 0.0f;
// Create a convolution Node
auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
.setxDesc(std::get<X_TENSOR>(tensors))
.setwDesc(std::get<W_TENSOR>(tensors))
.setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
.setcDesc(convDesc)
.setAlpha(alpha)
.setBeta(beta)
.build();
DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
// Create a Scale Node.
auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(conv_op.getOutputTensor())
.setbDesc(std::get<Z_TENSOR>(tensors))
.setyDesc(std::get<AFTERADD_TENSOR>(tensors))
.setpwDesc(scaleDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
// Create a Bias Node.
auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(scale_op.getOutputTensor())
.setbDesc(std::get<B_TENSOR>(tensors))
.setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
.setpwDesc(biasDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
// Create an optional add Node.
auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(bias_op.getOutputTensor())
.setbDesc(std::get<OPTIONAL>(tensors))
.setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
.setpwDesc(addDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, add_op.describe());
// Create an Activation Node.
auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
.setyDesc(std::get<AFTERACT_TENSOR>(tensors))
.setpwDesc(actDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, act_op.describe());
// Create a Gen_Index Node.
auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get<AFTERACT_TENSOR>(tensors))
.setyDesc(std::get<GEN_INDEX_TENSOR>(tensors))
.setpwDesc(genIndexDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());
// Create a LessThan Node.
auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
.setbDesc(std::get<THRESHOLD_TOP_TENSOR>(tensors))
.setyDesc(std::get<MASK_TOP_TENSOR>(tensors))
.setpwDesc(lessThanDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());
// Create a GreaterThan Node.
auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get<GEN_INDEX_TENSOR>(tensors))
.setbDesc(std::get<THRESHOLD_BOTTOM_TENSOR>(tensors))
.setyDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
.setpwDesc(greaterThanDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());
// Create a LogicalOr Node.
auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get<MASK_TOP_TENSOR>(tensors))
.setbDesc(std::get<MASK_BOTTOM_TENSOR>(tensors))
.setyDesc(std::get<MASK_TENSOR>(tensors))
.setpwDesc(logicalOrDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());
// Create a Binary_Selection Node.
auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(std::get<AFTERCONV_TENSOR>(tensors))
.setbDesc(std::get<AFTERACT_TENSOR>(tensors))
.settDesc(std::get<MASK_TENSOR>(tensors))
.setyDesc(std::get<Y_TENSOR>(tensors))
.setpwDesc(selectionDesc)
.build();
DEBUG_CUDNN_MSG(log_buf, selection_op.describe());
// Create an Operation Graph. In this case it is convolution add bias activation
if (devPtrI) {
std::array<cudnn_frontend::Operation const*, 10> ops = {&conv_op, &scale_op, &bias_op, &add_op, &act_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr<float>();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI, devPtrT, devPtrU};
int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i', 't', 'u'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(8, data_ptrs)
.setUids(8, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
} else {
std::array<cudnn_frontend::Operation const*, 9> ops = {&conv_op, &scale_op, &bias_op, &act_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};
auto opGraph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle_)
.setOperationGraph(ops.size(), ops.data())
.build();
// Create string encoding for plan caching
auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
auto workspace_size = plan.getWorkspaceSize();
DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
void* workspace_ptr = nullptr;
auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
if (workspace_size > 0) {
workspace_ptr = workspace_tensor.data_ptr<float>();
}
void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrT, devPtrU};
int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 't', 'u'};
auto variantPack = cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(7, data_ptrs)
.setUids(7, uids)
.build();
DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
checkCudnnErr(status);
cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
}
} catch (cudnn_frontend::cudnnException e) {
std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
}
}
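// Reference semantics of the mask tail above, as a plain C++ sketch (not in
// the original file): once conv+scale+bias(+add)+ReLU has produced `act`, a
// per-index mask along `axis` picks between the raw convolution output and the
// activated value. Shown here for a single flattened axis of length n with
// thresholds t and u:
static void mask_select_reference(const float* conv, const float* act, float* y,
                                  int64_t n, int t, int u) {
    for (int64_t i = 0; i < n; ++i) {
        bool masked = (static_cast<int>(i) < t) || (static_cast<int>(i) > u);
        // CUDNN_POINTWISE_BINARY_SELECT with x = conv output, b = activation:
        y[i] = masked ? conv[i] : act[i];
    }
}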
void
run_dconv_add_drelu_dscale(int64_t* x_dim_padded,
int64_t* pad,
int64_t* convstride,
int64_t* dilation,
int64_t* w_dim_padded,
int64_t* y_dim_padded,
cudnnDataType_t dataType,
at::Half* devPtrX,
at::Half* devPtrW,
at::Half* devPtrY,
at::Half* devPtrZ,
at::Half* devPtrR,
at::Half* devPtrI) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;
    // Creates the necessary tensor descriptors
    dconv_add_descriptors tensors = create_dconv_add_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
...
@@ -2201,6 +2568,8 @@ run_dconv_add(int64_t* x_dim_padded,
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_INPUT_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
...
@@ -2214,12 +2583,26 @@ run_dconv_add(int64_t* x_dim_padded,
        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
    // optional add
    auto addDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_ADD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
    // Define the scale backward operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
    float alpha = 1.0f;
    float beta = 0.0f;
...
@@ -2235,564 +2618,292 @@ run_dconv_add(int64_t* x_dim_padded,
        .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create add Node.
    auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setbDesc(std::get<DGRAD_INPUT_TENSOR>(tensors))
                      .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setpwDesc(addDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, add_op.describe());
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &add_op, &act_op, &scale_op};
    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();
    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrI};
    int64_t uids[] = {'x', 'y', 'w', 's', 'r', 'i'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(6, data_ptrs)
                           .setUids(6, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
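// Element for element, the four fused nodes above compute: conv dgrad, plus
// the residual gradient ('i'), gated by the saved forward ReLU input ('r'),
// then multiplied by the per-channel scale ('s'). A naive CPU reference of the
// pointwise tail (everything after the convolution) is sketched below for
// clarity; the names are illustrative and the real computation happens inside
// the fused cuDNN plan.
#include <cstddef>

void dgrad_add_drelu_dscale_reference(const float* after_dconv,  // conv dgrad result
                                      const float* dgrad_input,  // residual gradient ('i')
                                      const float* relu_input,   // saved forward activation ('r')
                                      const float* scale,        // per-channel scale ('s')
                                      float* dx,                 // fused output ('x')
                                      std::size_t count, std::size_t channels) {
  for (std::size_t i = 0; i < count; ++i) {
    float summed = after_dconv[i] + dgrad_input[i];      // CUDNN_POINTWISE_ADD
    float gated  = relu_input[i] > 0.0f ? summed : 0.0f; // CUDNN_POINTWISE_RELU_BWD
    dx[i] = gated * scale[i % channels];                 // CUDNN_POINTWISE_MUL, NHWC broadcast
  }
}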
// inputs contains x,w,z,b,(i)
std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[] = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim] = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[10].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[] = {0, 0};
  int64_t padA1[] = {1, 1};
  int64_t dilationA[] = {1, 1};
  int64_t convstrideA[] = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // run
  at::Half* x = inputs[0].data_ptr<at::Half>();
  at::Half* w = inputs[1].data_ptr<at::Half>();
  at::Half* z = inputs[4].data_ptr<at::Half>();
  at::Half* b = inputs[7].data_ptr<at::Half>();
  auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* y1 = out1.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, w, y1, z, b, nullptr);

  DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item<float>());

  w = inputs[2].data_ptr<at::Half>();
  z = inputs[5].data_ptr<at::Half>();
  b = inputs[8].data_ptr<at::Half>();
  auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* y2 = out2.data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, y1, w, y2, z, b, nullptr);
  DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item<float>());

  // create output of conv3
  auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
  at::Half* y3 = out3.data_ptr<at::Half>();

  // create output of conv4 that may exist
  auto identity = at::empty_like(out3);
  at::Half* yi = identity.data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
    w = inputs[10].data_ptr<at::Half>();
    z = inputs[11].data_ptr<at::Half>();
    b = inputs[12].data_ptr<at::Half>();
    run_conv_scale_bias(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, w, yi, z, b);
    DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item<float>());
  }
  else {
    yi = x;
  }

  w = inputs[3].data_ptr<at::Half>();
  z = inputs[6].data_ptr<at::Half>();
  b = inputs[9].data_ptr<at::Half>();

  run_conv_scale_bias_add_activation(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, y2, w, y3, z, b, yi);
  DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item<float>());

  outputs.push_back(out1);
  outputs.push_back(out2);
  outputs.push_back(out3);

  return outputs;
}
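// bottleneck_forward sizes every intermediate with getFwdConvOutputDim before
// any cuDNN descriptor is built. The textbook output-size arithmetic it relies
// on is reproduced below for reference (assumed, not copied from the helper):
#include <cassert>
#include <cstdint>

int64_t fwd_conv_output_dim(int64_t in, int64_t pad, int64_t filter,
                            int64_t stride, int64_t dilation) {
  // Standard cross-correlation output size.
  return 1 + (in + 2 * pad - dilation * (filter - 1) - 1) / stride;
}

void fwd_conv_output_dim_examples() {
  assert(fwd_conv_output_dim(56, 1, 3, 1, 1) == 56);  // 3x3, pad 1, stride 1: size preserved
  assert(fwd_conv_output_dim(56, 0, 1, 2, 1) == 28);  // 1x1, pad 0, stride 2: halved
}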
std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
  bool requires_grad = inputs[0].requires_grad();
  std::cout << std::fixed;

  // create output vector
  std::vector<at::Tensor> outputs;
  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;

  // setup dimensions
  int64_t dimA[] = {0, 0, 0, 0};
  int64_t filterdimA1[] = {0, 0, 0, 0};
  int64_t filterdimA2[] = {0, 0, 0, 0};
  int64_t filterdimA3[] = {0, 0, 0, 0};
  int64_t filterdimA4[] = {0, 0, 0, 0};

  // All dim calculation after this order of n,c,h,w
  int axis[] {0, 1, 2, 3};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 3;
    axis[2] = 1;
    axis[3] = 2;
  }
  for (int dim = 0; dim < 4; dim++) {
    dimA[dim] = inputs[0].size(axis[dim]);
    filterdimA1[dim] = inputs[1].size(axis[dim]);
    filterdimA2[dim] = inputs[2].size(axis[dim]);
    filterdimA3[dim] = inputs[3].size(axis[dim]);
  }
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    for (int dim = 0; dim < 4; dim++) {
      filterdimA4[dim] = inputs[14].size(axis[dim]);
    }
  }

  // output dim in n,c,h,w used by backend
  int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
  int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below

  // use these fixed value for test run
  int64_t padA[] = {0, 0};
  int64_t padA1[] = {1, 1};
  int64_t dilationA[] = {1, 1};
  int64_t convstrideA[] = {1, 1};
  int64_t convstride1X1[] = {stride_1X1, stride_1X1};

  // compute output from pad/stride/dilation
  outdimA1[0] = dimA[0];
  outdimA1[1] = filterdimA1[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
  }

  outdimA2[0] = outdimA1[0];
  outdimA2[1] = filterdimA2[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  outdimA3[0] = outdimA2[0];
  outdimA3[1] = filterdimA3[0];
  for (int dim = 0; dim < 2; dim++) {
    outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
  }

  // Create output tensor in the correct shape in pytorch's view
  int64_t outdim1[] = {0, 0, 0, 0};
  int64_t outdim2[] = {0, 0, 0, 0};
  int64_t outdim3[] = {0, 0, 0, 0};
  if (explicit_nhwc) {
    axis[0] = 0;
    axis[1] = 2;
    axis[2] = 3;
    axis[3] = 1;
  }
  for (int dim = 0; dim < 4; dim++) {
    outdim1[dim] = outdimA1[axis[dim]];
    outdim2[dim] = outdimA2[axis[dim]];
    outdim3[dim] = outdimA3[axis[dim]];
  }

  // dconv3+drelu2+dscale2
  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
  at::Half* dy3 = inputs[10].data_ptr<at::Half>();

  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());

  // wgrad
  auto wgrad3 = at::empty_like(inputs[3]);
  at::Half* dw3 = wgrad3.data_ptr<at::Half>();
  run_dconv(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, conv_in, dw3, dy3, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
  at::Half* dy2 = grad_out2.data_ptr<at::Half>();
  at::Half* w = inputs[3].data_ptr<at::Half>();
  at::Half* z = inputs[5].data_ptr<at::Half>();
  at::Half* relu2 = inputs[13].data_ptr<at::Half>();

  run_dconv_drelu_dscale(outdimA2, padA, convstrideA, dilationA, filterdimA3, outdimA3, CUDNN_DATA_HALF, dy2, w, dy3, z, relu2);
  DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item<float>());

  // dconv2+drelu1+dscale1
  conv_in = inputs[12].data_ptr<at::Half>();

  // wgrad
  auto wgrad2 = at::empty_like(inputs[2]);
  at::Half* dw2 = wgrad2.data_ptr<at::Half>();
  run_dconv(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, conv_in, dw2, dy2, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
  w = inputs[2].data_ptr<at::Half>();
  z = inputs[4].data_ptr<at::Half>();
  at::Half* relu1 = inputs[12].data_ptr<at::Half>();

  // fused dgrad
  run_dconv_drelu_dscale(outdimA1, padA1, convstrideA, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, dy1, w, dy2, z, relu1);
  /*
  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (stride_1X1 != 1){
    // dgrad
    run_dconv(outdimA1, padA1, convstride1X1, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, dy1, w, dy2, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
    // mul fused mask
    grad_out1.mul_(inputs[15]);
  }
  else {
    at::Half* relu1 = inputs[12].data_ptr<at::Half>();
    // fused dgrad
    run_dconv_drelu_dscale(outdimA1, padA1, convstride1X1, dilationA, filterdimA2, outdimA2, CUDNN_DATA_HALF, dy1, w, dy2, z, relu1);
  }
  */
  DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item<float>());

  // create grads of conv4 that may exist
  auto grad_x_conv4 = at::empty_like(inputs[0]);
  at::Half* dx_conv4 = grad_x_conv4.data_ptr<at::Half>();
  at::Tensor wgrad4;

  // x used for dconv1 and dconv4 wgrad
  at::Half* x = inputs[0].data_ptr<at::Half>();

  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
    w = inputs[14].data_ptr<at::Half>();
    at::Half* dy_conv4 = inputs[11].data_ptr<at::Half>();
    if (requires_grad) {
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, dx_conv4, w, dy_conv4, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
      // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item<float>());
    }
    // wgrad
    wgrad4 = at::empty_like(inputs[14]);
    at::Half* dw4 = wgrad4.data_ptr<at::Half>();
    run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA4, outdimA3, CUDNN_DATA_HALF, x, dw4, dy_conv4, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
  }
  else {
    // if there is no downsample, dx_conv4 is fork of drelu3
    dx_conv4 = inputs[11].data_ptr<at::Half>();
  }

  // dconv1+add
  // wgrad
  auto wgrad1 = at::empty_like(inputs[1]);
  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
  run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, x, dw1, dy1, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);

  // dgrad
  w = inputs[1].data_ptr<at::Half>();
  auto grad_x = at::empty_like(inputs[0]);
  at::Half* dx = grad_x.data_ptr<at::Half>();

  // backward strided conv cannot be fused
  // if stride == 1 but channel changes, we can fuse here
  if (requires_grad){
    if (stride_1X1 != 1){
      run_dconv(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
      // add 2 together
      grad_x.add_(grad_x_conv4);
    }
    else {
      run_dconv_add(dimA, padA, convstride1X1, dilationA, filterdimA1, outdimA1, CUDNN_DATA_HALF, dx, w, dy1, dx_conv4);
    }
  }

  DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
  DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
  outputs.push_back(grad_x);
  outputs.push_back(wgrad1);
  outputs.push_back(wgrad2);
  outputs.push_back(wgrad3);
  if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
    DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item<float>());
    outputs.push_back(wgrad4);
  }
  return outputs;
}
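// The index juggling in bottleneck_backward is easier to follow with the input
// layout spelled out. This map is read directly off the accesses above; slots
// not touched in this excerpt are omitted.
//   inputs[0]      x           forward input (also dconv1/dconv4 wgrad input)
//   inputs[1..3]   w1..w3      convolution weights
//   inputs[4..5]   z1, z2      scales consumed by the fused dgrad calls
//   inputs[10]     dy3         gradient flowing into conv3
//   inputs[11]     dy_conv4    downsample-path gradient (or fork of drelu3)
//   inputs[12]     relu1       saved activation, reused as conv2 wgrad input
//   inputs[13]     relu2       saved activation, reused as conv3 wgrad input
//   inputs[14]     w4          downsample weight (only when that path exists)
//   inputs[15]     mask        halo mask (referenced in the commented-out variant)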
void
run_dconv_drelu_dscale_mask(int64_t* x_dim_padded,
                            int64_t* pad,
                            int64_t* convstride,
                            int64_t* dilation,
                            int64_t* w_dim_padded,
                            int64_t* y_dim_padded,
                            int64_t* threshold_dim,
                            cudnnDataType_t dataType,
                            at::Half* devPtrX,
                            at::Half* devPtrW,
                            at::Half* devPtrY,
                            at::Half* devPtrZ,
                            at::Half* devPtrR,
                            int* devPtrT,
                            int* devPtrU,
                            int axis) {
  cudnnHandle_t handle_ = torch::native::getCudnnHandle();
  std::stringstream log_buf;
  try {
    int convDim = 2;
    // Creates the necessary tensor descriptors
    dconv_mask_descriptors tensors = create_dconv_mask_descriptors(
        x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, threshold_dim, dataType);
    DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_OPTIONAL_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_GEN_INDEX_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_MASK_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors).describe());
    DEBUG_CUDNN_MSG(log_buf, std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors).describe());
    // Define the convolution problem
    auto convDesc = cudnn_frontend::ConvDescBuilder()
                        .setDataType(CUDNN_DATA_FLOAT)
                        .setMathMode(CUDNN_CROSS_CORRELATION)
                        .setNDims(convDim)
                        .setStrides(convDim, convstride)
                        .setPrePadding(convDim, pad)
                        .setPostPadding(convDim, pad)
                        .setDilation(convDim, dilation)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
    // Define the activation backward operation
    auto actDesc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RELU_BWD)
                       .setMathPrecision(CUDNN_DATA_FLOAT)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
    // Define the scale backward operation
    auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
                         .setMode(CUDNN_POINTWISE_MUL)
                         .setMathPrecision(CUDNN_DATA_FLOAT)
                         .build();
    DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
    // Define the genIndex descriptor
    auto genIndexDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_GEN_INDEX)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .setAxis(axis)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, genIndexDesc.describe());
    // Define the lessThan descriptor
    auto lessThanDesc = cudnn_frontend::PointWiseDescBuilder()
                            .setMode(CUDNN_POINTWISE_CMP_LT)
                            .setMathPrecision(CUDNN_DATA_FLOAT)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, lessThanDesc.describe());
    // Define the greaterThan descriptor
    auto greaterThanDesc = cudnn_frontend::PointWiseDescBuilder()
                               .setMode(CUDNN_POINTWISE_CMP_GT)
                               .setMathPrecision(CUDNN_DATA_FLOAT)
                               .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThanDesc.describe());
    // Define the logical_or descriptor
    auto logicalOrDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_LOGICAL_OR)
                             .setMathPrecision(CUDNN_DATA_BOOLEAN)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOrDesc.describe());
    // Define the binary_selection descriptor
    auto selectionDesc = cudnn_frontend::PointWiseDescBuilder()
                             .setMode(CUDNN_POINTWISE_BINARY_SELECT)
                             .setMathPrecision(CUDNN_DATA_FLOAT)
                             .build();
    DEBUG_CUDNN_MSG(log_buf, selectionDesc.describe());
    float alpha = 1.0f;
    float beta = 0.0f;
    // Create a convolution Node
    auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
                       .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                       .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
                       .setdyDesc(std::get<DY_TENSOR>(tensors))
                       .setcDesc(convDesc)
                       .setAlpha(alpha)
                       .setBeta(beta)
                       .build();
    DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
    // TODO: do we need getOutputTensor(), and what it returns in backward case?
    // Create an relu backward Node.
    auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                      .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                      .setxDesc(std::get<RELU_TENSOR>(tensors))
                      .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                      .setpwDesc(actDesc)
                      .build();
    DEBUG_CUDNN_MSG(log_buf, act_op.describe());
    // Create a Scale Node.
    auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                        .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
                        .setbDesc(std::get<SCALE_TENSOR>(tensors))
                        .setyDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                        .setpwDesc(scaleDesc)
                        .build();
    DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
    // Create a Gen_Index Node.
    auto genIndex_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setpwDesc(genIndexDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, genIndex_op.describe());
    // Create a LessThan Node.
    auto lessThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                           .setbDesc(std::get<DGRAD_THRESHOLD_TOP_TENSOR>(tensors))
                           .setyDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                           .setpwDesc(lessThanDesc)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, lessThan_op.describe());
    // Create a GreaterThan Node.
    auto greaterThan_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                              .setxDesc(std::get<DGRAD_GEN_INDEX_TENSOR>(tensors))
                              .setbDesc(std::get<DGRAD_THRESHOLD_BOTTOM_TENSOR>(tensors))
                              .setyDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                              .setpwDesc(greaterThanDesc)
                              .build();
    DEBUG_CUDNN_MSG(log_buf, greaterThan_op.describe());
    // Create a LogicalOr Node.
    auto logicalOr_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<DGRAD_MASK_TOP_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_MASK_BOTTOM_TENSOR>(tensors))
                            .setyDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setpwDesc(logicalOrDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, logicalOr_op.describe());
    // Create a Binary_Selection Node.
    auto selection_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
                            .setbDesc(std::get<DGRAD_OPTIONAL_TENSOR>(tensors))
                            .settDesc(std::get<DGRAD_MASK_TENSOR>(tensors))
                            .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
                            .setpwDesc(selectionDesc)
                            .build();
    DEBUG_CUDNN_MSG(log_buf, selection_op.describe());
    // Create an Operation Graph. In this case it is convolution add bias activation
    std::array<cudnn_frontend::Operation const*, 8> ops = {&conv_op, &act_op, &scale_op, &genIndex_op, &lessThan_op, &greaterThan_op, &logicalOr_op, &selection_op};
    auto opGraph = cudnn_frontend::OperationGraphBuilder()
                       .setHandle(handle_)
                       .setOperationGraph(ops.size(), ops.data())
                       .build();
    // Create string encoding for plan caching
    auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
    DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
    auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
    DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
    auto workspace_size = plan.getWorkspaceSize();
    DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
    void* workspace_ptr = nullptr;
    auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
    if (workspace_size > 0) {
      workspace_ptr = workspace_tensor.data_ptr<float>();
    }
    void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR, devPtrT, devPtrU};
    int64_t uids[] = {'x', 'y', 'w', 's', 'r', 't', 'u'};
    auto variantPack = cudnn_frontend::VariantPackBuilder()
                           .setWorkspacePointer(workspace_ptr)
                           .setDataPointers(7, data_ptrs)
                           .setUids(7, uids)
                           .build();
    DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
    cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
    checkCudnnErr(status);
    cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
  } catch (cudnn_frontend::cudnnException e) {
    std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
  }
}
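// run_dconv_drelu_dscale_mask extends the fused dgrad with a spatial gate:
// gen_index materializes each element's coordinate along 'axis', two compares
// flag coordinates outside [threshold_top, threshold_bottom], logical_or
// merges the flags, and binary_select picks between the raw conv dgrad and the
// fused drelu+dscale result. The sketch below fixes the index to the H axis
// and assumes the mask keeps the raw dgrad in the halo rows; that polarity is
// inferred from the operand order above, so treat it as illustrative only.
#include <cstdint>

void dgrad_mask_reference(const float* after_dconv,   // raw conv dgrad
                          const float* fused_result,  // drelu+dscale output
                          float* dx,
                          int64_t H, int64_t W, int64_t C,
                          int64_t threshold_top, int64_t threshold_bottom) {
  for (int64_t h = 0; h < H; ++h) {
    // CMP_LT, CMP_GT, then LOGICAL_OR of the two comparison results.
    bool halo = (h < threshold_top) || (h > threshold_bottom);
    for (int64_t w = 0; w < W; ++w) {
      for (int64_t c = 0; c < C; ++c) {
        int64_t i = (h * W + w) * C + c;                  // NHWC within one image
        dx[i] = halo ? after_dconv[i] : fused_result[i];  // BINARY_SELECT
      }
    }
  }
}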
namespace {
struct bottleneck_forward_status {
  int64_t dimA[4];
...
apex/contrib/csrc/peer_memory/peer_memory_cuda.cu
View file @ fa8e7d99
...
@@ -388,7 +388,7 @@ void push_pull_halos_1d(
     const int numThreads = 128;
     dim3 block(numThreads, 1, 1);
     AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, top_out_halo.scalar_type(), "push_pull_halos_1d_kernel", [&]{
-        if (diagnostics) printf("size(scalar_t) = %d\n", sizeof(scalar_t));
+        if (diagnostics) printf("size(scalar_t) = %ld\n", sizeof(scalar_t));
         scalar_t* toh_p = top_out_halo.data_ptr<scalar_t>();
         scalar_t* tox_p = top_out_tx.data_ptr<scalar_t>();
         scalar_t* tix_p = top_inp_tx.data_ptr<scalar_t>();
...
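// The single functional change in this file fixes a printf format mismatch:
// sizeof yields size_t, which "%d" (an int specifier) does not match on LP64
// platforms. "%ld" matches the width of size_t on 64-bit Linux; the fully
// portable specifier is "%zu", as in this minimal example:
#include <cstdio>

int main() {
  printf("size(float) = %zu\n", sizeof(float));  // %zu: portable size_t specifier
  return 0;
}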