gaoqiong / composable_kernel · Commits

Commit a9a392b4, authored Aug 01, 2019 by Chao Liu
experimenting TensorCoordinate and new merged tensor copy operator
Parent: 2eeeb176
Showing 6 changed files with 159 additions and 172 deletions.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  +45 -0
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +0 -34
composable_kernel/include/tensor_description/tensor_coordinate.hpp  +43 -101
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  +1 -1
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +69 -35
driver/src/driver.cpp  +1 -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+45 -0)
@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");

+#if 0 // debug
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             InBlockCopySrcDataPerRead_B,
             InBlockCopyDstDataPerWrite_N2>(
             {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_e_n1_b_n2_global_merged_desc),
+            decltype(in_e_n1_b_n2_block_desc),
+            MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
+            NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
+            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+            InBlockCopySubLengths_E_N1_B_N2,
+            InBlockCopyClusterLengths_E_N1_B_N2,
+            InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif

         // weight tensor
         // tensor descriptor in device memory, src of blockwise copy
@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

+#if 0 // debug
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});
+#else
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(wei_e_k_global_desc),
+            decltype(wei_e_k_block_desc),
+            NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
+            NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
+            decltype(wei_e_k_block_desc.GetLengths()),
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif

         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx
@@ -277,8 +305,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             __syncthreads();

+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+#else
+            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif
         }

         // copy output: register to global memory
@@ -328,6 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);

+#if 0 // debug
             threadwise_generic_tensor_slice_copy_v1(
                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
                 p_out_thread,
@@ -338,6 +372,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                 arithmetic_sequence_gen<0, 8, 1>::type{},
                 Number<1>{});
+#else
+            ThreadwiseGenericTensorSliceCopy_v2<
+                Float,
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+                NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
+                MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+                {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+                .Run(p_out_thread, p_out_thread_on_global);
+#endif
         }
     }
 };
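Note on the pattern above: the #else branches switch this kernel from the older debug copy operators to BlockwiseGenericTensorSliceCopy_v2, which is parameterized by a source and a destination coordinate type and is advanced between iterations with MoveSrcSlicingWindow. The following host-only sketch only illustrates that construct-once / Run / move-window idea; the TileCopier type, tile sizes, and row-major layout are invented for the example and are not the library's implementation.

// Minimal stand-in for the "construct once, Run(), then MoveSrcSlicingWindow()"
// pattern used above. All names and sizes here are illustrative only.
#include <cstdio>
#include <vector>

struct TileCopier
{
    int src_row; // origin of the slicing window in the source
    int src_col;

    // copy a rows x cols tile from src (row-major, src_stride) into dst (packed)
    void Run(const std::vector<float>& src, int src_stride, float* dst, int rows, int cols) const
    {
        for(int r = 0; r < rows; ++r)
            for(int c = 0; c < cols; ++c)
                dst[r * cols + c] = src[(src_row + r) * src_stride + (src_col + c)];
    }

    // slide the window; similar in spirit to MoveSrcSlicingWindow({EPerBlock, 0, ...}, true)
    void MoveSrcSlicingWindow(int row_step, int col_step)
    {
        src_row += row_step;
        src_col += col_step;
    }
};

int main()
{
    const int src_stride = 16;
    std::vector<float> src(32 * src_stride);
    for(std::size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<float>(i);

    float tile[4 * 8];
    TileCopier copier{0, 0};

    for(int iter = 0; iter < 3; ++iter)
    {
        copier.Run(src, src_stride, tile, 4, 8); // like blockwise_in_copy.Run(...)
        copier.MoveSrcSlicingWindow(4, 0);       // like MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true)
        std::printf("iter %d: tile[0] = %f\n", iter, tile[0]);
    }
    return 0;
}

In the kernel itself the visible context shows the per-iteration __syncthreads() and the two window moves; the corresponding Run calls sit outside the shown hunks.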
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+0 -34)
@@ -301,40 +301,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                  b_thread_data_on_global % B1});

             threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
-#elif 0
-        // This is a hack, because slicing a merged dimension is not supported yet.
-        // This should be replaced with logic above, once slicing a merged dimension support
-        // become available
-
-        // dst descriptor
-        constexpr auto out_k0_k1_b_global_desc =
-            make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
-                                                Sequence<1>{},
-                                                Sequence<2>{},
-                                                Sequence<0, 3, 4>{});
-
-        // src descriptor
-        constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
-            Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});
-
-        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
-            Float,
-            decltype(out_k0_k1_b_thread_desc),
-            decltype(out_k0_k1_b_global_desc),
-            NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
-            MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
-            Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
-            {0, 0, 0},
-            {k_thread_data_on_global / K1, k_thread_data_on_global % K1, b_thread_data_on_global});
-
-        for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
-        {
-            threadwise_out_copy.Run(p_out_thread, p_out_global);
-
-            threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
-            threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
-        }
 #elif 1
         // This is a hack, because slicing a merged dimension is not supported yet.
         // This should be replaced with logic above, once slicing a merged dimension support
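Both the deleted "#elif 0" branch and the surviving "#elif 1" branch carry the same remark: slicing a merged dimension is not supported yet. Stepping a window along a merged dimension is awkward because the merged index enumerates several original dimensions at once, so a fixed step may carry across them and every original index has to be recomputed. A standalone illustration of that decomposition (the shapes and strides are made-up example values, not taken from the kernel):

// Illustration of why moving a window along a "merged" dimension needs carry logic:
// the merged index b enumerates (n, ho, wo) lexicographically, so a fixed step in b
// may wrap wo and ho. Shapes and strides are arbitrary example values.
#include <array>
#include <cstdio>

int main()
{
    constexpr int N = 2, Ho = 4, Wo = 4;               // merged B = N * Ho * Wo = 32
    constexpr std::array<int, 3> strides = {64, 8, 1}; // strides of N, Ho, Wo in some layout

    const int step = 6; // step along the merged dimension B
    for(int b = 0; b < N * Ho * Wo; b += step)
    {
        const int n  = b / (Ho * Wo);
        const int ho = (b / Wo) % Ho;
        const int wo = b % Wo;
        const int offset = n * strides[0] + ho * strides[1] + wo * strides[2];
        std::printf("b=%2d -> (n=%d, ho=%d, wo=%d), offset=%d\n", b, n, ho, wo, offset);
    }
    return 0;
}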
composable_kernel/include/tensor_description/tensor_coordinate.hpp  (+43 -101)
@@ -16,7 +16,7 @@ struct NormalTensorCoordinate
     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

     __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
+        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
     {
     }
@@ -26,38 +26,15 @@ struct NormalTensorCoordinate
     {
     }

-    __host__ __device__ constexpr Array<unsigned, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

-    template <class IDim, bool PositiveDirection>
-    __host__ __device__ void
-    MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
-    {
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-            mOffset += step_size * tensor_desc_type::GetStride(idim);
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-            mOffset -= step_size * tensor_desc_type::GetStride(idim);
-        }
-    }
-
     // T is Array or Sequence
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
-        });
+        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

         return *this;
     }
@@ -65,14 +42,9 @@ struct NormalTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
     {
-#if 0
-        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
-        });
+        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

         return *this;
     }
@@ -93,19 +65,25 @@ struct NormalTensorCoordinate
         return coord;
     }

-    // reposition point of origin, and return compensated offset
+    // reposition point of origin, and return compensated offset.
+    // This is a hack to reduce index calculation during looping over
+    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
+    // out the run-time offset to the pointer (to the tensor data) held by this
+    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
+    // the data, so only 1 run-time variable (update pointer) is needed, instead
+    // of 2 run-time variables (old pointer and this offset)
+    // TODO: after introducing the concept of "run-time tensor view", which contains the
+    // run-time pointer to the data, always keep track of the pointer, instead of both
+    // offset and the pointer. This also bring additional benefit that we don't need to
+    // worry the offset might underflow (because offset is unsigned integer) when updating it.
     __host__ __device__ constexpr index_t RepositionOrigin()
     {
         index_t offset_diff = mOffset;

         mOffset = 0;
-        mIndex  = make_zero_array<index_t, nDim>();

         return offset_diff;
     }

-    // private:
-    Array<index_t, nDim> mIndex;
+    private:
     index_t mOffset;
 };
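With this change NormalTensorCoordinate keeps only mOffset: the multi-index is folded into the offset at construction, operator+= adds GetOffsetFromMultiIndex(step_sizes) directly, and RepositionOrigin() hands the accumulated offset back to the caller as the expanded comment describes. A minimal standalone analogue of that offset-only design (the Coord3d type, its strides, and the sizes are invented purely for illustration):

// Offset-only coordinate over a fixed-stride 3-D tensor, mirroring the idea that
// NormalTensorCoordinate now keeps only mOffset. Names and strides are example-only.
#include <array>
#include <cstdio>
#include <vector>

struct Coord3d
{
    static constexpr std::array<int, 3> strides = {24, 6, 1}; // some example layout

    explicit Coord3d(std::array<int, 3> idx)
        : mOffset(idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2])
    {
    }

    int GetOffset() const { return mOffset; }

    Coord3d& operator+=(std::array<int, 3> step)
    {
        // same spirit as mOffset += GetOffsetFromMultiIndex(step_sizes)
        mOffset += step[0] * strides[0] + step[1] * strides[1] + step[2] * strides[2];
        return *this;
    }

    // "reposition point of origin, and return compensated offset":
    // hand the accumulated offset back to the caller and reset to zero.
    int RepositionOrigin()
    {
        int diff = mOffset;
        mOffset  = 0;
        return diff;
    }

    private:
    int mOffset;
};

int main()
{
    std::vector<float> data(96, 1.0f);
    float* p = data.data();

    Coord3d coord({1, 2, 3});
    coord += std::array<int, 3>{0, 1, 0};

    p += coord.RepositionOrigin(); // fold the offset into the pointer: one run-time variable
    std::printf("value at repositioned origin: %f (offset now %d)\n", *p, coord.GetOffset());
    return 0;
}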
@@ -120,8 +98,7 @@ struct MergedTensorCoordinate
         tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

     __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index},
-          mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
+        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
     {
         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto idim) {
@@ -146,8 +123,6 @@ struct MergedTensorCoordinate
     {
     }

-    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

     // step_size should be known at compile time
@@ -157,17 +132,7 @@ struct MergedTensorCoordinate
     {
         constexpr auto idim = IDim{};

-        // update multi-index
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-        }
-
-        // update rest
+        // update original index
         static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
             constexpr auto partial_original_dims =
                 tensor_desc_type::GetContainedOriginalDimensions(idim);
@@ -253,19 +218,10 @@ struct MergedTensorCoordinate
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
-        }).Else([&](auto) {
-            constexpr auto idim_original =
-                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-            static_if<PositiveDirection>{}([&](auto fwd) {
-                mOriginalIndex(idim_original) += step_size;
-                mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
-                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            }).Else([&](auto fwd) {
-                mOriginalIndex(idim_original) -= step_size;
-                mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            });
+        }).Else([&](auto fwd) {
+            static_if<PositiveDirection>{}([&](auto) {
+                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
+            }).Else([&](auto) {
+                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
+            });
         });
     }
@@ -273,10 +229,9 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
+
+        index_t normal_offset_diff = 0;

         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
@@ -288,10 +243,7 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
     {
-#if 0
-        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
@@ -316,33 +268,23 @@ struct MergedTensorCoordinate
         return coord;
     }

-    // reposition point of origin, and return compensated offset
-    __host__ __device__ constexpr index_t RepositionOrigin()
-    {
-        index_t offset_diff = 0;
-
-        static_for<0, nDim, 1>{}([&](auto idim_) {
-            constexpr auto idim = decltype(idim_){};
-
-            static_if<!tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
-                constexpr auto idim_original =
-                    tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-                mIndex(idim)                  = 0;
-                mOriginalIndex(idim_original) = 0;
-
-                mOffset -= mPartialOffsets[idim];
-                offset_diff += mPartialOffsets[idim];
-
-                mPartialOffsets(idim) = 0;
-            });
-        });
-
-        return offset_diff;
-    }
-
-    // private:
-    Array<index_t, nDim> mIndex;
-    Array<index_t, nOriginalDim> mOriginalIndex;
-    Array<index_t, nDim> mPartialOffsets;
-    index_t mOffset;
+    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
+
+    private:
+    // Allocate register memory for all merged dimensions and normal dimensions.
+    // However, only those merged dimensions, whose index will be involved in arithmetic
+    // after the construction of this TensorCoordinate (e.g. when user move a slicing
+    // window on the merged dimension), will use these register memory.
+    // Let's hope compiler will optimize away those register memory allocated for normal
+    // dimensions, and those merged dimensions, that would never be involved in index
+    // arithmetic after construction of TensorCoordinate.
+    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
+    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
+    // count on compiler to optimize way those register memory for us
+    Array<index_t, nOriginalDim> mOriginalIndex;
+
+    // mPartialOffsets is needed for for unsigned index type
+    Array<index_t, nDim> mPartialOffsets;
+
+    // complete offset
+    index_t mOffset;
 };
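For merged dimensions, MoveOnDimension above keeps a per-dimension partial offset and updates the total as mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset, adding before subtracting because the index type is unsigned. A toy host-side version of that bookkeeping (the two-dimension split and all the numbers are invented for illustration):

// Toy version of the "do '+' before '-' to avoid underflow" update used by
// MergedTensorCoordinate: the total offset is adjusted by the change in one
// dimension's partial offset, using unsigned arithmetic. Values are examples.
#include <cstdio>

int main()
{
    using index_t = unsigned; // mirrors the unsigned index type in the library

    index_t partial[2] = {5, 40}; // per-dimension partial offsets
    index_t offset     = partial[0] + partial[1];

    // move "dimension 0" backwards: its partial offset drops from 5 to 2
    index_t old_partial = partial[0];
    partial[0]          = 2;

    // add the new partial offset first, then subtract the old one, so the
    // unsigned intermediate never dips below zero
    offset = (offset + partial[0]) - old_partial;

    std::printf("offset = %u (expected %u)\n", offset, partial[0] + partial[1]);
    return 0;
}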
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1 -1)
@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 0
+#if 1
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+69 -35)
@@ -85,6 +85,40 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;

+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;

+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;

+    using InBlockCopySubLengths_E_B     = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;

+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

+    constexpr index_t InBlockCopyDataPerAccess_B = 1;

+    using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;

+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
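The "1x1 filter, 8x8 image" parameter set added above can be sanity-checked arithmetically, assuming the usual interpretation that SubLengths x ClusterLengths must tile the per-block copy slice and that the product of the cluster lengths gives the number of participating threads (that interpretation is an assumption here, not stated in the diff):

// Consistency checks for the added tuning parameters, under the assumption that
// SubLengths x ClusterLengths tiles the per-block slice and that the product of
// the cluster lengths equals the number of participating threads.
constexpr int BlockSize = 256;
constexpr int BPerBlock = 128, KPerBlock = 128, EPerBlock = 8;

// InBlockCopy: SubLengths_E_B = <2, 2>, ClusterLengths_E_B = <4, 64>
static_assert(2 * 4 == EPerBlock, "E tiling of input block copy");
static_assert(2 * 64 == BPerBlock, "B tiling of input block copy");
static_assert(4 * 64 == BlockSize, "thread count of input block copy");

// WeiBlockCopy: SubLengths_E_K = <4, 1>, ClusterLengths_E_K = <2, 128>
static_assert(4 * 2 == EPerBlock, "E tiling of weight block copy");
static_assert(1 * 128 == KPerBlock, "K tiling of weight block copy");
static_assert(2 * 128 == BlockSize, "thread count of weight block copy");

// GEMM thread grid: (MLevel0 * MLevel1) * (NLevel0 * NLevel1) = 16 * 16
static_assert((4 * 4) * (4 * 4) == BlockSize, "thread count of blockwise GEMM");

int main() { return 0; }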
@@ -96,43 +130,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

-    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
-        GridSize, BlockSize, T,
-        decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
-        ConvStrides, ConvDilations,
-        BPerBlock, KPerBlock, EPerBlock,
-        GemmMPerThreadSubC, GemmNPerThreadSubC,
-        GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
-        GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
-        InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
-        InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
-        InBlockCopyDataPerAccess_B,
-        WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
-        WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder, WeiBlockCopyDstAccessOrder,
-        WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>{};
-
     for(index_t i = 0; i < nrepeat; ++i)
     {
+        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
+            GridSize, BlockSize, T,
+            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
+            ConvStrides, ConvDilations,
+            BPerBlock, KPerBlock, EPerBlock,
+            GemmMPerThreadSubC, GemmNPerThreadSubC,
+            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
+            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
+            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
+            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
+            InBlockCopyDataPerAccess_B,
+            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder, WeiBlockCopyDstAccessOrder,
+            WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>{};
+
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
driver/src/driver.cpp  (+1 -1)
@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 0
+#if 1
     constexpr index_t N  = 64;
     constexpr index_t C  = 1536;
     constexpr index_t HI = 8;