yangql / composable_kernel-1

Commit 23f633cd
Authored Jun 17, 2019 by Chao Liu
Parent: 3c0646d4

    clean up for miopen

Showing 12 changed files with 22 additions and 52 deletions.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp  +2 -8
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp  +2 -6
composable_kernel/include/tensor_operation/blockwise_gemm.hpp  +3 -26
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +2 -4
composable_kernel/include/tensor_operation/threadwise_gemm.hpp  +1 -1
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +1 -0
composable_kernel/include/utility/Sequence.hpp  +1 -1
composable_kernel/include/utility/amd_inline_asm.hpp  +1 -1
composable_kernel/include/utility/config_amd.hpp.in  +4 -0
composable_kernel/include/utility/config_nvidia.hpp.in  +4 -0
composable_kernel/include/utility/functional.hpp  +1 -1
composable_kernel/include/utility/utility.hpp  +0 -4
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -66,10 +66,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
     constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};

     constexpr auto True = integral_constant<bool, true>{};

@@ -77,10 +74,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

     constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0);
     constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
-    constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
-    constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);

     constexpr index_t K  = out_n_k_h_w_global_desc.GetLength(I1);
     constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);

@@ -346,7 +341,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     {
         constexpr index_t K2 = GemmMPerThreadSubC;
         constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
-        constexpr index_t K0 = K / (K1 * K2);

         // define tensor descriptor for threadwise copy

         // output memory layout descriptor in register
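The removed constants (I4, I6, I7, Hi, Wi, K0) are unused compile-time values, so dropping them changes nothing. For readers unfamiliar with the pattern, a minimal self-contained sketch of the idea, assuming Number<N> behaves like std::integral_constant<index_t, N> (the library's real alias lives in its utility headers):

    #include <cstdint>
    #include <type_traits>

    using index_t = uint32_t;

    // Stand-in for the library's Number<> alias: the value lives in the type.
    template <index_t N>
    using Number = std::integral_constant<index_t, N>;

    int main()
    {
        constexpr auto I2 = Number<2>{};
        // Because the value is part of the type, calls like GetLength(I2) can
        // select a tensor dimension entirely at compile time.
        static_assert(I2 == 2, "compile-time index");
        return 0;
    }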
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp

@@ -54,12 +54,8 @@ __host__ __device__ constexpr auto
 template <class TDesc>
 __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
 {
-    const auto desc = TDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+    printf(
+        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
 }

 } // namespace ck
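The temporary desc object and the unused I0/I1 constants are dropped because the descriptor's geometry is available as static member functions, so the printing helper needs no instance. A sketch of that idea with hypothetical type names (not the library's actual descriptor):

    #include <cstdio>

    // Hypothetical stateless descriptor: all geometry is static.
    template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
    struct DescriptorSketch
    {
        static constexpr unsigned NRow() { return NRow_; }
        static constexpr unsigned NCol() { return NCol_; }
        static constexpr unsigned RowStride() { return RowStride_; }
    };

    template <class TDesc>
    void print_descriptor_sketch(TDesc, const char* s)
    {
        // No instance needed: the dimensions come straight from the type.
        printf("%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
    }

    int main() { print_descriptor_sketch(DescriptorSketch<8, 16, 16>{}, "sketch"); }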
composable_kernel/include/tensor_operation/blockwise_gemm.hpp

@@ -49,7 +49,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
     constexpr index_t N = BlockMatrixB::NCol();
-    constexpr index_t K = BlockMatrixA::NRow();

     static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
                       N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,

@@ -100,12 +99,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     {
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
         constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;

@@ -125,9 +118,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                  const FloatB* __restrict__ p_b_block,
                  FloatC* __restrict__ p_c_thread) const
    {
-        constexpr auto True  = integral_constant<bool, true>{};
-        constexpr auto False = integral_constant<bool, false>{};
-
         constexpr auto a_block_mtx  = BlockMatrixA{};
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

@@ -146,13 +136,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_mtx =
             make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});

-        // thread A-sub, B-sub for copy
-        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
-
-        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
-
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];

@@ -172,9 +155,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         using Float4 = vector_type<float, 4>::MemoryType;

-        Float4* reg_a = (Float4*)(p_a_thread);
-        Float4* reg_b = (Float4*)(p_b_thread);
-        Float4* reg_c = (Float4*)(p_c_thread);
+        Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
+        Float4* reg_b = reinterpret_cast<Float4*>(p_b_thread);
+        Float4* reg_c = reinterpret_cast<Float4*>(p_c_thread);

         reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
         reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);

@@ -215,8 +198,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();

@@ -245,8 +226,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;

-        const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
-
#pragma unroll
         // loop over k
         for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)

@@ -306,8 +285,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();
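The only additions in this file are the three reg_a/reg_b/reg_c lines: C-style casts become reinterpret_cast, which behaves identically but makes the reinterpretation explicit and cannot silently cast away const. A self-contained sketch of the pattern, assuming clang's ext_vector_type extension that the config headers in this commit already use (names here are illustrative only):

    typedef float Float4 __attribute__((ext_vector_type(4))); // clang vector extension

    void repack_sketch(float* p_a_thread, const float* p_a_block)
    {
        // New style: the reinterpretation is explicit and const cannot be dropped by accident.
        Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
        // Old style, same behavior: Float4* reg_a = (Float4*)(p_a_thread);

        // Vectorized load of four floats, mirroring the reg_a[0] = ... line in the diff.
        reg_a[0] = *reinterpret_cast<const Float4*>(p_a_block);
    }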
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -93,8 +93,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           "wrong! cannot evenly divide sliced tensor into cluster");
         });

-        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
-
         // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {

@@ -297,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto IDim = Number<IDim_>{};
             constexpr index_t idim = IDim;

-            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
+            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
                 // logic for a merged dimension, also works for non-merged dimension, but its logic may
                 // be unncessarily complicated for compiler to remove calculations that are useless for
                 // a non-merged dimension

@@ -337,7 +335,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
                 mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
-            }).Else([&](auto fwd) {
+            }).Else([&](auto) {
                 // Logic for non-merged dimension. If you are never going to move the slicing window on
                 // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
                 // which are being calculated here, will never be used later. In this case, compiler
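The static_for<0, nDim, 1> call visible in the first hunk is a compile-time loop whose index arrives as a compile-time constant, which is why the body can form Number<IDim_>. A rough sketch of that idea, written with C++17 if constexpr for brevity rather than the library's actual implementation:

    #include <type_traits>

    // Illustrative compile-time loop: calls f(integral_constant<int, Begin>{}),
    // f(integral_constant<int, Begin + Step>{}), ... while Begin < End.
    template <int Begin, int End, int Step, class F>
    void static_for_sketch(F f)
    {
        if constexpr(Begin < End)
        {
            f(std::integral_constant<int, Begin>{});
            static_for_sketch<Begin + Step, End, Step>(f);
        }
    }

    void example()
    {
        int sum = 0;
        static_for_sketch<0, 4, 1>([&](auto idim) { sum += idim; }); // idim is a compile-time constant
    }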
composable_kernel/include/tensor_operation/threadwise_gemm.hpp

@@ -71,7 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransC>,
                                 FloatC* __restrict__ p_c_thread)
 {
-    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
         constexpr auto a_mtx = MatrixA{};
         constexpr auto b_mtx = MatrixB{};
         constexpr auto c_mtx = MatrixC{};
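Like the slice-copy change above, this hunk only drops the name of an unused generic-lambda parameter. The parameter exists because of the static_if idiom: each branch body is a generic lambda, so only the branch that is actually invoked gets instantiated, and the argument ("fwd") is a hook for bodies that need to keep expressions type-dependent; where the body no longer uses it, the name can go. A loose sketch of the idiom (not the library's real implementation, which predates if constexpr):

    template <bool Cond>
    struct static_if_sketch
    {
        template <class F>
        void operator()(F f) const
        {
            if constexpr(Cond)
            {
                f(0); // argument is only a hook for dependent expressions; bodies may ignore it
            }
        }
    };

    void example()
    {
        static_if_sketch<true>{}([&](auto) { /* taken branch */ });
        static_if_sketch<false>{}([&](auto) { /* body never instantiated here */ });
    }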
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -11,6 +11,7 @@
 namespace ck {

+// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
 template <class Float,
           class SrcDesc,
           class DstDesc,
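The new comment documents a caveat for vectorized copies: when DataPerAccess is greater than 1, the addresses involved must satisfy the vector type's alignment. Purely as an illustration (hypothetical helper, not part of the commit), a caller-side check might look like:

    #include <cstdint>

    // Hypothetical check: a 4-wide float access needs 16-byte alignment.
    constexpr unsigned DataPerAccess = 4;

    bool is_sufficiently_aligned(const float* p)
    {
        return reinterpret_cast<std::uintptr_t>(p) % (DataPerAccess * sizeof(float)) == 0;
    }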
composable_kernel/include/utility/Sequence.hpp

@@ -253,7 +253,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 }

 template <index_t... Xs, index_t... Ys>
-__host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
+__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Sequence<Ys...>)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
composable_kernel/include/utility/amd_inline_asm.hpp

@@ -102,7 +102,7 @@ __device__ void outerProduct1x4(const float& a,
                                 const vector_type<float, 4>::MemoryType& b,
                                 vector_type<float, 4>::MemoryType& c)
 {
-    outerProduct1x4(&a, (float*)&b, (float*)&c);
+    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
 }

 __device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
composable_kernel/include/utility/config_amd.hpp.in

@@ -19,6 +19,10 @@ typedef float float4_t __attribute__((ext_vector_type(4)));
 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
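These two helpers are added here (and to the NVIDIA config below) while the same definitions are removed from utility.hpp at the end of this commit, so the thread- and block-id accessors now live with the per-backend configuration. A hypothetical kernel using them might look like this (illustrative only, not code from the commit):

    __global__ void copy_kernel_sketch(const float* src, float* dst, index_t n)
    {
        const index_t i = get_block_1d_id() * blockDim.x + get_thread_local_1d_id();
        if(i < n)
        {
            dst[i] = src[i];
        }
    }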
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -23,6 +23,10 @@ using float4_t = float4;
 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
composable_kernel/include/utility/functional.hpp

@@ -18,7 +18,7 @@ struct forwarder
 struct swallow
 {
     template <class... Ts>
-    __host__ __device__ constexpr swallow(Ts&&... ts)
+    __host__ __device__ constexpr swallow(Ts&&...)
     {
     }
 };
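This is the same clean-up as in Sequence.hpp and the gemm headers: parameters that exist only so a parameter pack can be expanded do not need names, and leaving them unnamed avoids unused-parameter warnings. A small sketch of how a swallow-style type is typically used (hypothetical names, written to compile on its own):

    // A constructor that accepts and discards any arguments; the pack needs no name.
    struct swallow_sketch
    {
        template <class... Ts>
        constexpr swallow_sketch(Ts&&...)
        {
        }
    };

    template <class... Fs>
    void run_all(Fs... fs)
    {
        // Braced-init-list elements are evaluated left to right, so every fs() runs in order.
        swallow_sketch{(fs(), 0)...};
    }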
composable_kernel/include/utility/utility.hpp

@@ -6,10 +6,6 @@
 namespace ck {

-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
 template <class X, class Y>
 using is_same = std::is_same<X, Y>;