yangql / composable_kernel-1 / Commits / 39d92e7d
"...composable_kernel-1.git" did not exist on "22d438ae9e02b674e1656263df07799ee06fb466"
Commit 39d92e7d authored Sep 25, 2019 by Chao Liu

removing old implementation of tensor descriptor

parent 012b5253

Showing 11 changed files with 128 additions and 64 deletions (+128 −64)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp  +28 −38
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp  +1 −1
composable_kernel/include/tensor_description/multi_index_transform.hpp  +1 −2
composable_kernel/include/tensor_description/tensor_coordinate.hpp  +2 −2
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  +5 −4
composable_kernel/include/tensor_description/tensor_descriptor.hpp  +1 −1
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp  +75 −0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +4 −4
composable_kernel/include/utility/config_amd.hpp.in  +8 −3
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp  +1 −7
driver/src/driver.cpp  +2 −2
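The running theme of the diff below is replacing the old compile-time ConstantTensorDescriptor types with the new tensor_descriptor implementation: a "native" descriptor built from lengths and strides via make_native_tensor_descriptor, with transformed descriptors layered on top. As a minimal sketch of the arithmetic a native descriptor encodes (the struct and names here are illustrative stand-ins, not CK's actual compile-time types):

    // Illustrative stand-in for a "native" descriptor: lengths + strides, with
    // offset(idx) = sum_i idx[i] * stride[i]. The real CK descriptors carry the
    // same data in compile-time Sequence<...> types.
    #include <array>
    #include <cstddef>
    #include <cstdio>

    template <std::size_t NDim>
    struct NativeDescriptorSketch
    {
        std::array<std::size_t, NDim> lengths;
        std::array<std::size_t, NDim> strides;

        std::size_t CalculateOffset(const std::array<std::size_t, NDim>& idx) const
        {
            std::size_t offset = 0;
            for(std::size_t i = 0; i < NDim; ++i)
                offset += idx[i] * strides[i];
            return offset;
        }
    };

    int main()
    {
        // packed NCHW tensor, N=2 C=3 H=4 W=5: strides are suffix products
        NativeDescriptorSketch<4> desc{{2, 3, 4, 5}, {60, 20, 5, 1}};
        std::printf("%zu\n", desc.CalculateOffset({1, 2, 3, 4})); // 60 + 40 + 15 + 4 = 119
    }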
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -2,7 +2,6 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
@@ -73,12 +72,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
        constexpr auto True = integral_constant<bool, true>{};

-       constexpr auto in_n_c_hi_wi_global_desc  = InGlobalDesc{};
-       constexpr auto wei_k_c_y_x_global_desc   = WeiGlobalDesc{};
-       constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};
+       constexpr auto in_n_c_hi_wi_global_desc =
+           make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
+       constexpr auto wei_k_c_y_x_global_desc =
+           make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides());
+       constexpr auto out_n_k_ho_wo_global_desc =
+           make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides());

        constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
        constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
@@ -119,11 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
        constexpr index_t KBlockWork = K / KPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

-       constexpr auto block_work_desc =
-           make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
+       constexpr auto block_work_desc = transform_tensor_descriptor(
+           make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
+           make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
+           make_tuple(Sequence<0, 1>{}),
+           make_tuple(Sequence<0>{}));

-       const auto block_work_multi_id =
-           block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
+       const auto block_work_multi_id =
+           block_work_desc.CalculateLowerIndex(get_block_1d_id());

        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
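The new block_work_desc merges the (KBlockWork, BBlockWork) grid into one dimension so that CalculateLowerIndex can decompose the flat hardware block id back into a (k, b) pair. A hedged sketch of that decomposition, assuming Merge splits a flat id by row-major div/mod (names invented for illustration):

    #include <array>
    #include <cstdio>

    // (k_block, b_block) recovered from a flat block id, row-major order
    std::array<int, 2> calculate_lower_index(int block_1d_id, int BBlockWork)
    {
        return {block_1d_id / BBlockWork, block_1d_id % BBlockWork};
    }

    int main()
    {
        const int BBlockWork = 8; // e.g. B / BPerBlock
        for(int id = 8; id < 11; ++id)
        {
            const auto idx = calculate_lower_index(id, BBlockWork);
            std::printf("block %d -> (k=%d, b=%d)\n", id, idx[0], idx[1]);
        }
    }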
@@ -139,7 +137,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
        constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
-           make_tuple(Unmerge<Sequence<N0, N1, N2>>{},
+           make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
                       PassThrough<C>{},
                       Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                       Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
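The Embed transforms above encode the convolution window arithmetic. Reading Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>> as "lower index = ConvDilationH*y + ConvStrideH*ho + 0" (an assumption based on the coefficient list, not confirmed by this page), the sliding-window access pattern looks like:

    #include <cstdio>

    // hypothetical reading of Embed: affine map from (y, ho) to a padded-input row
    int embed_h(int y, int ho, int ConvDilationH, int ConvStrideH)
    {
        return y * ConvDilationH + ho * ConvStrideH + 0; // trailing 0 = constant offset
    }

    int main()
    {
        // 3-tap filter, stride 2, dilation 1: output row 5 reads padded rows 10, 11, 12
        for(int y = 0; y < 3; ++y)
            std::printf("y=%d, ho=5 -> hip=%d\n", y, embed_h(y, 5, 1, 2));
    }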
@@ -185,21 +183,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                               InBlockCopyDstDataPerWrite_N2>(
                {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

+#if 0
        // weight tensor
        // tensor descriptor in device memory, src of blockwise copy
-       constexpr auto wei_k_e_global_desc =
-           unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
-
-       constexpr auto wei_e_k_global_desc =
-           reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{});
+       constexpr auto wei_e_k_global_desc =
+           transform_tensor_descriptor(wei_k_c_y_x_global_desc,
+                                       make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
+                                       make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
+                                       make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else
+       // hack
+       constexpr auto wei_e_k_global_desc_old =
+           WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
+
+       constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
+           wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
+#endif

        // tensor descriptor in LDS, dst of blockwise copy
        // be careful of LDS alignment
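Both branches build the same E x K view of the KCYX weight tensor, where E = C*Y*X is the implicit-GEMM contraction dimension; the transform_tensor_descriptor version is left disabled under #if 0 in favor of the old-API "hack". A sketch of the index math the Merge<Sequence<C, Y, X>> transform performs on a packed layout (helper names are invented):

    #include <cstdio>

    int weight_offset_kcyx(int k, int c, int y, int x, int C, int Y, int X)
    {
        return ((k * C + c) * Y + y) * X + x; // packed KCYX layout
    }

    int weight_offset_ek(int e, int k, int C, int Y, int X)
    {
        // Merge<Sequence<C, Y, X>>: e = (c * Y + y) * X + x
        const int c = e / (Y * X), y = (e / X) % Y, x = e % X;
        return weight_offset_kcyx(k, c, y, x, C, Y, X);
    }

    int main()
    {
        const int C = 8, Y = 3, X = 3;
        // same element as (k=2, c=1, y=0, x=1): offset 154
        std::printf("%d\n", weight_offset_ek(/*e*/ 10, /*k*/ 2, C, Y, X));
    }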
@@ -340,10 +330,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

                // LDS double buffer: store next data to LDS
-               blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                   p_in_register_buffer, p_in_block_next);
-               blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                   p_wei_register_buffer, p_wei_block_next);
+               blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+               blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
            }
        }
@@ -368,10 +356,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
            blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

            // LDS double buffer: store next data to LDS
-           blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-               p_in_register_buffer, p_in_block_double + in_block_space);
-           blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-               p_wei_register_buffer, p_wei_block_double + wei_block_space);
+           blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
+                                                    p_in_block_double + in_block_space);
+           blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
+                                                     p_wei_block_double + wei_block_space);

            // odd iteration
            __syncthreads();
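These call sites sit inside the kernel's LDS double-buffer main loop: while the blockwise GEMM consumes one half of LDS, the next tile is staged into the other half, and the two halves swap each iteration. A host-side pseudocode sketch of that pipeline (the callback names and loop shape are assumptions; on the GPU the barrier is __syncthreads()):

    #include <cstddef>

    template <typename Copy, typename Gemm, typename Barrier>
    void lds_double_buffer_loop(float* lds, std::size_t block_space, std::size_t num_tiles,
                                Copy&& load_next_tile, Gemm&& gemm, Barrier&& barrier)
    {
        float* now  = lds;               // half being consumed
        float* next = lds + block_space; // half being filled

        for(std::size_t tile = 0; tile + 1 < num_tiles; ++tile)
        {
            load_next_tile(tile + 1, next); // store next data to LDS
            gemm(now);                      // compute on the current half
            barrier();                      // __syncthreads() equivalent
            float* tmp = now;               // swap halves for the next iteration
            now        = next;
            next       = tmp;
        }
        gemm(now); // tail: the last tile has nothing left to prefetch
    }

    int main()
    {
        float lds[16] = {};
        lds_double_buffer_loop(
            lds, 8, 4,
            [](std::size_t, float*) { /* global -> register -> LDS copy */ },
            [](float*) { /* blockwise GEMM on the current half */ },
            [] { /* barrier */ });
    }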
@@ -393,12 +381,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
            Sequence<GemmMRepeat, GemmMPerThreadSubC, N1, 1, N2>{});

        // output memory layout descriptor in device memory
-       constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc_old =
-           OutGlobalDesc::Fold(I1, Number<K1>{}).Fold(I0, Number<N1>{}, Number<N2>{});
-
-       constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = make_native_tensor_descriptor(
-           out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetLengths(),
-           out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetStrides());
+       constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor(
+           out_n_k_ho_wo_global_desc,
+           make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
+                      UnMerge<Sequence<K0, K1>>{},
+                      PassThrough<Ho>{},
+                      PassThrough<Wo>{}),
+           make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+           make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{}));

        // output merged global tensor descriptor, dst of threadwise copy
        constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor(
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -384,7 +384,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
        constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor(
            out_k_b_global_desc,
-           make_tuple(Unmerge<Sequence<K0, K1>>{}, Unmerge<Sequence<B0, B1>>{}),
+           make_tuple(UnMerge<Sequence<K0, K1>>{}, UnMerge<Sequence<B0, B1>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -252,7 +252,6 @@ struct Merge
            });
-
        // highest dimension, no out-of-bound check
        if(borrow)
        {
            --idx_low_new(0);
@@ -273,7 +272,7 @@ struct Merge
// UpperLengths: Sequence<...>
template <typename UpperLengths>
-struct Unmerge
+struct UnMerge
{
    static constexpr index_t nDimLow = 1;
    static constexpr index_t nDimUp  = UpperLengths::Size();
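For reference, the transform being renamed here is the inverse of Merge: it splits one lower dimension into several upper ones, and the lower index is rebuilt from the upper index with suffix products of the upper lengths. A small sketch of that reconstruction (function name invented):

    #include <array>
    #include <cstdio>

    int unmerge_lower_index(const std::array<int, 3>& idx_up, const std::array<int, 3>& up_lengths)
    {
        // idx_low = (up0 * L1 + up1) * L2 + up2
        return (idx_up[0] * up_lengths[1] + idx_up[1]) * up_lengths[2] + idx_up[2];
    }

    int main()
    {
        const std::array<int, 3> lengths{2, 4, 8}; // e.g. N0, N1, N2
        std::printf("%d\n", unmerge_lower_index({1, 2, 3}, lengths)); // (1*4+2)*8+3 = 51
    }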
composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -325,14 +325,14 @@ struct TensorCoordinate
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -81,7 +81,7 @@ struct NativeTensorCoordinate
    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

    private:
-   // mIndex may be saved and update, however, the value of some (or all) of its entries may
+   // mIndex may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
@@ -178,7 +178,8 @@ struct TransformedTensorCoordinate
    }

    private:
-   // mIndexUp may be calculated and update, however, the value of some (or all) of its entries may
+   // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
+   // may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
@@ -192,7 +193,7 @@ struct TensorCoordinate_v2
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -200,7 +201,7 @@ struct TensorCoordinate_v2
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -299,7 +299,7 @@ struct TransformedTensorDescriptor
        return GetLowerTensorDescriptor().GetElementSpace();
    }

-   // TODO: right now return value is constexpr because use of non-constepxr lambda
+   // TODO: right now return value is not constexpr because use of non-constexpr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
    {
        LowerIndex idx_low;
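The TODO refers to a language limitation of the time: before C++17, a lambda-expression could not appear in a constant expression, so descriptor code that routes per-dimension loops through lambdas (CK's static_for style) could not produce compile-time constants even when the enclosing function was meant to be constexpr. A hypothetical illustration (names invented):

    #include <cstdio>

    int sum_via_lambda(int n) // would like this to be constexpr
    {
        int acc  = 0;
        auto body = [&acc](int i) { acc += i; }; // pre-C++17: bars constant evaluation
        for(int i = 0; i < n; ++i)
            body(i);
        return acc;
    }

    int main()
    {
        std::printf("%d\n", sum_via_lambda(4)); // 0+1+2+3 = 6, computed at run time
    }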
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
@@ -96,6 +96,81 @@ __host__ __device__ constexpr auto
        LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
}

+template <typename Lengths, typename Strides>
+__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides)
+{
+    static_assert(Lengths::Size() == Strides::Size(), "wrong!");
+
+    bool flag = true;
+
+    for(index_t i = 0; i < Lengths::Size() - 1; ++i)
+    {
+        flag = flag && Strides::At(i) == Strides::At(i + 1) * Lengths::At(i + 1);
+    }
+
+    return flag;
+}
+
+// unfold only support NativeTennsorDescriptor, for now
+template <index_t FirstUnfoldDim, index_t LastUnfoldDim, typename... Ts>
+__host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor<Ts...> desc,
+                                                            Number<FirstUnfoldDim>,
+                                                            Number<LastUnfoldDim>)
+{
+    constexpr index_t nDim = desc.GetNumOfDimension();
+
+    static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim && FirstUnfoldDim <= LastUnfoldDim,
+                  "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
+
+    // left and right
+    constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{};
+    constexpr auto middle =
+        typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
+    constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
+
+    // sanity-check if unfoldable
+    static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
+                  "wrong! not unfoldable");
+
+    // unfolded length, stride
+    constexpr index_t unfold_length =
+        reduce_on_sequence(desc.GetLengths(middle), math::multiplies<index_t>{}, Number<1>{});
+
+    constexpr index_t unfold_stride = desc.GetStride(Number<LastUnfoldDim>{});
+
+    // new lengths, strides
+    constexpr auto new_lengths =
+        desc.GetLengths(left).PushBack(Number<unfold_length>{}).PushBack(desc.GetLengths(right));
+
+    constexpr auto new_strides =
+        desc.GetStrides(left).PushBack(Number<unfold_stride>{}).PushBack(desc.GetStrides(right));
+
+    return make_native_tensor_descriptor(new_lengths, new_strides);
+}
+
+#if 0
+template <typename LowerTensorDescriptor,
+          typename PadDimensionIds,
+          typename LeftPads,
+          typename RightPads>
+__host__ __device__ constexpr auto
+pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
+{
+    constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
+
+    constexpr auto non_pad_low_dim_ids = xxx;
+
+    return transform_tensor_descriptor(
+        LowerTensorDescriptor{},
+        make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
+                       LeftPads,
+                       RightPads>{})
+            .PushBack(PassThrough<xxxx>...),
+        make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
+        sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}));
+}
+#endif
+
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
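The new unfold_tensor_descriptor fuses a contiguous run of dimensions into one, which is legal exactly when each stride equals the next stride times the next length (the condition AreDimensionsUnfoldable checks); the fused length is the product of the lengths and the fused stride is the innermost stride. A runtime sketch of the same check and computation (illustrative code, not the CK API):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Unfolded { long length; long stride; };

    Unfolded unfold(const std::vector<long>& lengths, const std::vector<long>& strides,
                    std::size_t first, std::size_t last)
    {
        long fused_length = 1;
        for(std::size_t i = first; i <= last; ++i)
        {
            if(i < last)
                assert(strides[i] == strides[i + 1] * lengths[i + 1] && "not unfoldable");
            fused_length *= lengths[i];
        }
        return {fused_length, strides[last]};
    }

    int main()
    {
        // packed KCYX weights, K=16 C=8 Y=3 X=3: unfold dims 1..3 into E = C*Y*X
        std::vector<long> lengths{16, 8, 3, 3}, strides{72, 9, 3, 1};
        const auto e = unfold(lengths, strides, 1, 3);
        std::printf("E length=%ld stride=%ld\n", e.length, e.stride); // 72, 1
    }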
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -738,12 +738,12 @@ struct BlockwiseGenericTensorSliceCopy_v4
    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
    {
#if 1
-       mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::vgpr>(
+       mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::generic>(
            p_src, p_buffer);
#else
        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData,
                                                                       SrcAddressSpace,
-                                                                      address_space_t::vgpr>(
+                                                                      address_space_t::generic>(
            p_src, p_buffer);
#endif
    }
@@ -752,11 +752,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
    {
#if 1
-       mThreadwiseStore.template Run_generic<TData, address_space_t::vgpr, DstAddressSpace>(
+       mThreadwiseStore.template Run_generic<TData, address_space_t::generic, DstAddressSpace>(
            p_buffer, p_dst);
#else
        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData,
-                                                                       address_space_t::vgpr,
+                                                                       address_space_t::generic,
                                                                        DstAddressSpace>(
            p_buffer, p_dst);
#endif
composable_kernel/include/utility/config_amd.hpp.in
@@ -37,11 +37,16 @@ typedef float float4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

+// data type conversion
template <class T>
-__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
+struct type_convert
{
-    d += s0 * s1;
-}
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};

} // namespace ck
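The new ck::type_convert is a generic static_cast wrapper that can be passed around as a function object, replacing the removed fused_multiply_accumulate helper in this header. A host-only sketch of how the functor is used, with __device__ dropped so it runs anywhere:

    #include <cstdio>

    // standalone copy of the functor added in this commit, minus __device__
    template <class T>
    struct type_convert
    {
        template <class X>
        T operator()(X x) const { return static_cast<T>(x); }
    };

    int main()
    {
        type_convert<float> to_float;
        float  y = to_float(3);                 // int -> float
        double d = type_convert<double>{}(y);   // float -> double, as a temporary functor
        std::printf("%f %f\n", y, d);
    }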
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -33,18 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

-#if 1
-   constexpr auto in_nchw_desc  = InDesc{};
-   constexpr auto wei_kcyx_desc = WeiDesc{};
-   constexpr auto out_nkhw_desc = OutDesc{};
-#else
    constexpr auto in_nchw_desc =
        make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
    constexpr auto wei_kcyx_desc =
        make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
    constexpr auto out_nkhw_desc =
-       make_native_tensor_descriptor(OutDesc::GetLegnths(), OutDesc::GetStrides());
-#endif
+       make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());

    constexpr index_t N = out_nkhw_desc.GetLength(I0);
    constexpr index_t K = out_nkhw_desc.GetLength(I1);
driver/src/driver.cpp
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
    constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N = 128;
    constexpr index_t C = 128;
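driver.cpp selects the convolution problem with a chain of preprocessor blocks in which exactly one branch carries a 1; this change moves the 1 from the 3x3-filter case to the 1x7-filter case. A stripped-down sketch of the pattern (the sizes below are placeholders, not the driver's actual values):

    #include <cstdio>

    int main()
    {
    #if 0 // 3x3 filter, 2x2 stride case (disabled)
        const int Y = 3, X = 3;
    #elif 1 // 1x7 filter, 0x3 pad case (active after this commit)
        const int Y = 1, X = 7;
    #endif
        std::printf("filter %dx%d\n", Y, X);
    }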