gaoqiong / composable_kernel · commit e5874b3f
Authored Nov 18, 2019 by Chao Liu
Parent: 95febeab

refactor copy and atomic
Showing 11 changed files with 276 additions and 168 deletions (+276 −168)
CMakeLists.txt                                                                          +2   -0
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp                  +5   -1
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp   +40  -30
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp      +32  -95
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp     +71  -39
composable_kernel/include/utility/common_header.hpp                                     +1   -0
composable_kernel/include/utility/config.amd.hpp.in                                     +2   -1
composable_kernel/include/utility/config.nvidia.hpp.in                                  +9   -1
composable_kernel/include/utility/float_type.amd.hpp.in                                 +53  -0
composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in                     +60  -0
driver/src/conv_driver.cpp                                                              +1   -1
CMakeLists.txt

@@ -55,9 +55,11 @@ include_directories(BEFORE
 if(DEVICE_BACKEND STREQUAL "AMD")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
 endif()

 add_subdirectory(driver)
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp

@@ -113,7 +113,11 @@ struct GridwiseCol2Im_eb_nchw
             1,
             1,
             BlockCopyDataPerAccess_B,
-            BlockCopyDataPerAccess_B>(
+            BlockCopyDataPerAccess_B,
+            AddressSpace::vgpr,
+            AddressSpace::vgpr,
+            AddressSpace::global,
+            InMemoryDataOperation::atomic_add>(
             {e_block_data_on_global, b_block_data_on_global},
             {e_block_data_on_global, b_block_data_on_global});
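The notable change in this hunk is InMemoryDataOperation::atomic_add: in col2im, neighbouring filter windows overlap, so several column entries (and, in the kernel, several threads) accumulate into the same image pixel, and the final store has to be an add rather than a plain write. Below is a small host-side C++ sketch of that overlap; it is illustrative only and not the kernel's code.

// Illustrative scalar col2im (not the kernel in this repo): several column
// entries map to the same input pixel whenever filter windows overlap, so the
// scatter is an accumulation. Run in parallel, the "+=" must become an atomic
// add, which is what InMemoryDataOperation::atomic_add selects above.
#include <cstdio>
#include <vector>

int main()
{
    // 1-D example: input width W=5, filter length Y=3, stride 1 -> B=3 output positions
    constexpr int W = 5, Y = 3, B = 3;

    // "column" matrix col[e][b] with e = filter tap, b = output position
    std::vector<std::vector<float>> col(Y, std::vector<float>(B, 1.0f));

    std::vector<float> image(W, 0.0f);

    for(int e = 0; e < Y; ++e)
        for(int b = 0; b < B; ++b)
            image[b + e] += col[e][b]; // overlapping windows hit the same pixel

    for(int w = 0; w < W; ++w)
        std::printf("image[%d] = %.1f\n", w, image[w]); // 1 2 3 2 1
}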
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -107,11 +107,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         constexpr auto True = integral_constant<bool, true>{};

-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
         constexpr auto global_address_space =
             integral_constant<AddressSpace, AddressSpace::global>{};
+        constexpr auto lds_address_space = integral_constant<AddressSpace, AddressSpace::lds>{};
+        constexpr auto vgpr_address_space = integral_constant<AddressSpace, AddressSpace::vgpr>{};
+        constexpr auto no_inmem_op =
+            integral_constant<InMemoryDataOperation, InMemoryDataOperation::none>{};

         static_assert(ConvDirection == ConvolutionDirection::Forward ||
                           ConvDirection == ConvolutionDirection::BackwardWeight,
                       "wrong! this kernel only support convolution forward and backward-weight");

@@ -230,7 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             2,
             3,
             InBlockCopySrcDataPerRead_B,
-            InBlockCopyDstDataPerWrite_N2>(
+            InBlockCopyDstDataPerWrite_N2,
+            AddressSpace::global,
+            AddressSpace::vgpr,
+            AddressSpace::lds,
+            InMemoryDataOperation::none>(
             {0, 0, b_block_data_on_global, 0},
             {0, 0, 0, 0});

         // weight tensor

@@ -266,7 +275,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             0,
             1,
             WeiBlockCopySrcDataPerRead_E,
-            WeiBlockCopyDstDataPerWrite_K>(
+            WeiBlockCopyDstDataPerWrite_K,
+            AddressSpace::global,
+            AddressSpace::vgpr,
+            AddressSpace::lds,
+            InMemoryDataOperation::none>(
             {0, k_block_data_on_global},
             {0, 0});

         // GEMM definition

@@ -334,10 +347,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.Run(
-                p_in_global, p_in_block_double, global_address_space, generic_address_space);
-            blockwise_wei_copy.Run(
-                p_wei_global, p_wei_block_double, global_address_space, generic_address_space);
+            blockwise_in_copy.Run(p_in_global, p_in_block_double);
+            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
         }

         // LDS double buffer: main body

@@ -368,10 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 __syncthreads();

                 // LDS doubel buffer: load next data from device mem
-                blockwise_in_copy.RunLoadThreadBuffer(
-                    p_in_global, p_in_thread_buffer, global_address_space, generic_address_space);
-                blockwise_wei_copy.RunLoadThreadBuffer(
-                    p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space);
+                blockwise_in_copy.RunLoadThreadBuffer(p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

@@ -397,10 +406,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 __syncthreads();

                 // LDS double buffer: load last data from device mem
-                blockwise_in_copy.RunLoadThreadBuffer(
-                    p_in_global, p_in_thread_buffer, global_address_space, generic_address_space);
-                blockwise_wei_copy.RunLoadThreadBuffer(
-                    p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space);
+                blockwise_in_copy.RunLoadThreadBuffer(p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);

                 // LDS double buffer: GEMM on 2nd-last data
                 blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

@@ -474,20 +481,23 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            const index_t b_thread_data_on_global =
                b_block_data_on_global + c_thread_mtx_on_block.col / N2;

            ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_k0_k1_n1_b_n2_thread_desc),
                                                  decltype(out_k0_k1_n1_b_n2_global_desc),
                                                  decltype(out_k0_k1_n1_b_n2_thread_desc.GetLengths()),
                                                  arithmetic_sequence_gen<0, 5, 1>::type,
                                                  3,
                                                  1,
-                                                 1>({0, 0, 0, 0, 0},
+                                                 1,
+                                                 AddressSpace::vgpr,
+                                                 AddressSpace::global,
+                                                 InMemoryDataOperation::none>({0, 0, 0, 0, 0},
                                                  {k_thread_data_on_global / K1,
                                                   k_thread_data_on_global % K1,
                                                   0,
                                                   b_thread_data_on_global,
                                                   0})
-                .Run(p_out_thread, p_out_global, generic_address_space, global_address_space);
+                .Run(p_out_thread, p_out_global);
        }
    }
};
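After this change the call sites above no longer pass address-space tags to Run; that information lives in the copy type itself, with generic/none defaults. Below is a host-side C++17 sketch of the pattern; the class name and the plain loop are illustrative stand-ins, not the kernel classes.

// Host-side C++17 sketch (illustrative names, not the kernel classes): the
// address spaces and the destination memory operation are non-type template
// parameters with defaults, so the call site is just Run(src, dst).
#include <cstddef>
#include <cstdio>

enum AddressSpace { generic, global, lds, vgpr };
enum InMemoryDataOperation { none, atomic_add };

template <std::size_t N,
          AddressSpace SrcAddressSpace     = AddressSpace::generic,
          AddressSpace DstAddressSpace     = AddressSpace::generic,
          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::none>
struct SliceCopySketch
{
    void Run(const float* p_src, float* p_dst) const
    {
        for(std::size_t i = 0; i < N; ++i)
        {
            if constexpr(DstInMemOp == InMemoryDataOperation::atomic_add)
                p_dst[i] += p_src[i]; // stand-in for the atomic-add store path
            else
                p_dst[i] = p_src[i];  // plain copy path
        }
    }
};

int main()
{
    float src[4] = {1, 2, 3, 4};
    float dst[4] = {};

    // "which memory, which store operation" is decided once, in the type
    SliceCopySketch<4, AddressSpace::global, AddressSpace::lds> in_block_copy;
    in_block_copy.Run(src, dst); // the call site stays a plain two-pointer call

    std::printf("%.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]);
}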
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -21,7 +21,11 @@ template <index_t BlockSize,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
-          index_t DstDataPerAccess>
+          index_t DstDataPerAccess,
+          AddressSpace SrcAddressSpace          = AddressSpace::generic,
+          AddressSpace ThreadBufferAddressSpace = AddressSpace::generic,
+          AddressSpace DstAddressSpace          = AddressSpace::generic,
+          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::none>
 struct BlockwiseGenericTensorSliceCopy_v4
 {
     static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();

@@ -66,130 +70,57 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return ThreadBufferDesc::GetElementSpace();
     }

-    template <typename BlockSrcData,
-              typename ThreadBufferData,
-              AddressSpace BlockSrcAddressSpace,
-              AddressSpace ThreadBufferAddressSpace>
-    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
-                                        ThreadBufferData* p_thread_buffer,
-                                        integral_constant<AddressSpace, BlockSrcAddressSpace>,
-                                        integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
-    {
-        constexpr auto block_src_address_space =
-            integral_constant<AddressSpace, BlockSrcAddressSpace>{};
-        constexpr auto thread_buffer_address_space =
-            integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
-
-        constexpr bool has_optimized_address_calculation =
-            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
-
-        // TODO: threadwise copy is still being tweaked
-        if(has_optimized_address_calculation)
-        {
-            mThreadwiseLoad.Run_optimized_src_address_calculation(
-                p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
-        }
-        else
-        {
-            mThreadwiseLoad.Run(
-                p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
-        }
-    }
-
-    template <typename BlockSrcData, typename ThreadBufferData>
-    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
-                                        ThreadBufferData* p_thread_buffer) const
-    {
-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
-
-        RunLoadThreadBuffer(p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
-    }
-
-    template <typename ThreadBufferData,
-              typename BlockDstData,
-              AddressSpace ThreadBufferAddressSpace,
-              AddressSpace BlockDstAddressSpace>
-    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
-                                         BlockDstData* p_block_dst,
-                                         integral_constant<AddressSpace, ThreadBufferAddressSpace>,
-                                         integral_constant<AddressSpace, BlockDstAddressSpace>) const
-    {
-        constexpr auto thread_buffer_address_space =
-            integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
-        constexpr auto block_dst_address_space =
-            integral_constant<AddressSpace, BlockDstAddressSpace>{};
-
-        constexpr bool has_optimized_address_calculation =
-            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
-
-        // TODO: threadwise copy is still being tweaked
-        if(has_optimized_address_calculation)
-        {
-            mThreadwiseStore.Run_optimized_dst_address_calculation(
-                p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
-        }
-        else
-        {
-#if 0 // debug
-            mThreadwiseStore.Run(
-                p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
-#else
-            constexpr auto True = integral_constant<bool, true>{};
-
-            mThreadwiseStore.Run(
-                p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space, True);
-#endif
-        }
-    }
-
-    template <typename ThreadBufferData, typename BlockDstData>
-    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
-                                         BlockDstData* p_block_dst) const
-    {
-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
-
-        RunStoreThreadBuffer(p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
-    }
-
-    template <typename BlockSrcData,
-              typename BlockDstData,
-              AddressSpace BlockSrcAddressSpace,
-              AddressSpace BlockDstAddressSpace>
-    __device__ void Run(const BlockSrcData* p_block_src,
-                        BlockDstData* p_block_dst,
-                        integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
-                        integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
-    {
-        BlockSrcData p_thread_buffer[GetThreadBufferSize()];
-
-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
-
-        RunLoadThreadBuffer(p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
-
-        // if there is type conversion, it's done during store
-        RunStoreThreadBuffer(p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
-    }
-
-    template <typename BlockSrcData, typename BlockDstData>
-    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
-    {
-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
-
-        Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
-    }
+    template <typename BlockSrcData, typename ThreadBufferData>
+    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
+                                        ThreadBufferData* p_thread_buffer) const
+    {
+        constexpr bool has_optimized_address_calculation =
+            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
+
+        // TODO: threadwise copy is still being tweaked
+        if(has_optimized_address_calculation)
+        {
+            mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
+        }
+        else
+        {
+            mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
+        }
+    }
+
+    template <typename ThreadBufferData, typename BlockDstData>
+    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
+                                         BlockDstData* p_block_dst) const
+    {
+        constexpr bool has_optimized_address_calculation =
+            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
+
+        // TODO: threadwise copy is still being tweaked
+        if(has_optimized_address_calculation)
+        {
+            mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer, p_block_dst);
+        }
+        else
+        {
+            mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
+        }
+    }
+
+    template <typename BlockSrcData, typename BlockDstData>
+    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
+    {
+        static_assert(ThreadBufferAddressSpace == AddressSpace::vgpr,
+                      "wrong! This function use vgpr as its thread "
+                      "buffer. However, you have set RunLoadThreadBuffer and RunStoreThreadBuffer "
+                      "to use ThreadBufferAddressSpace as their thread buffer, which is not vgpr. "
+                      "Behavior may be different");
+
+        BlockSrcData p_thread_buffer[GetThreadBufferSize()];
+
+        RunLoadThreadBuffer(p_block_src, p_thread_buffer);
+
+        // if there is type conversion, it's done during store
+        RunStoreThreadBuffer(p_thread_buffer, p_block_dst);
+    }

     template <typename T, bool PositiveDirection>

@@ -217,7 +148,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                   SrcDimAccessOrder,
                                                   SrcVectorAccessDim,
                                                   SrcDataPerAccess,
-                                                  1>;
+                                                  1,
+                                                  SrcAddressSpace,
+                                                  ThreadBufferAddressSpace,
+                                                  InMemoryDataOperation::none>;

     using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
                                                                   BlockDstDesc,

@@ -225,7 +159,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                                   DstDimAccessOrder,
                                                                   DstVectorAccessDim,
                                                                   1,
-                                                                  DstDataPerAccess>;
+                                                                  DstDataPerAccess,
+                                                                  ThreadBufferAddressSpace,
+                                                                  DstAddressSpace,
+                                                                  DstInMemOp>;

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
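What survives the deletion above is a two-phase Run: load the slice into a small thread-private buffer, then store it, with any type conversion done on the store side. Below is a plain C++ sketch of that shape; the names and loops are illustrative, with no GPU specifics.

// Sketch of the two-phase Run() kept by this refactor (illustrative names,
// plain C++): load into a thread-private staging buffer, then store.
#include <cstdio>

template <int N>
struct BlockwiseCopySketch
{
    template <typename SrcData, typename BufData>
    void RunLoadThreadBuffer(const SrcData* p_src, BufData* p_buf) const
    {
        for(int i = 0; i < N; ++i)
            p_buf[i] = p_src[i];
    }

    template <typename BufData, typename DstData>
    void RunStoreThreadBuffer(const BufData* p_buf, DstData* p_dst) const
    {
        for(int i = 0; i < N; ++i)
            p_dst[i] = static_cast<DstData>(p_buf[i]); // conversion happens on the store
    }

    template <typename SrcData, typename DstData>
    void Run(const SrcData* p_src, DstData* p_dst) const
    {
        SrcData p_thread_buffer[N]; // thread-private staging buffer (vgpr in the kernel)

        RunLoadThreadBuffer(p_src, p_thread_buffer);

        // if there is type conversion, it's done during store
        RunStoreThreadBuffer(p_thread_buffer, p_dst);
    }
};

int main()
{
    double src[3] = {1.5, 2.5, 3.5};
    float dst[3]  = {};

    BlockwiseCopySketch<3>{}.Run(src, dst);

    std::printf("%.1f %.1f %.1f\n", dst[0], dst[1], dst[2]);
}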
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -21,7 +21,10 @@ template <typename SrcDesc,
           typename DimAccessOrder,
           index_t VectorAccessDim,
           index_t SrcDataPerAccess,
-          index_t DstDataPerAccess>
+          index_t DstDataPerAccess,
+          AddressSpace SrcAddressSpace     = AddressSpace::generic,
+          AddressSpace DstAddressSpace     = AddressSpace::generic,
+          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::none>
 struct ThreadwiseGenericTensorSliceCopy_v4r2
 {
     static constexpr index_t nDim = SliceLengths::Size();

@@ -66,17 +69,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     template <typename SrcData,
-              typename DstData,
-              AddressSpace SrcAddressSpace,
-              AddressSpace DstAddressSpace,
-              bool DoAtomicAdd = false>
-    __device__ void Run(const SrcData* p_src,
-                        DstData* p_dst,
-                        integral_constant<AddressSpace, SrcAddressSpace>,
-                        integral_constant<AddressSpace, DstAddressSpace>,
-                        integral_constant<bool, DoAtomicAdd> do_atomic_add =
-                            integral_constant<bool, DoAtomicAdd>{}) const
+              typename DstData>
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

@@ -123,6 +117,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // has the same padding situation
                 if(src_coord.IsUpperIndexMappedToValidOffset())
                 {
+#if 0 // debug
                     static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                         *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =

@@ -137,6 +132,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                         *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                     });
+#else
+                    move_data<SrcData,
+                              SrcDataPerAccess,
+                              SrcAddressSpace,
+                              AddressSpace::vgpr,
+                              InMemoryDataOperation::none>(
+                        p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
+#endif
                 }
             }

@@ -163,6 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // has the same padding situation
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
+#if 0
                     static_if<!DoAtomicAdd>{}([&](auto) {
                         static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING

@@ -185,20 +189,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                             reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
                             *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
                     });
+#else
+                    move_data<DstData,
+                              DstDataPerAccess,
+                              AddressSpace::vgpr,
+                              DstAddressSpace,
+                              DstInMemOp>(
+                        p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
+#endif
                 }
             }
         });
     }

-    template <typename SrcData, typename DstData>
-    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
-    {
-        constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
-
-        Run(p_src, p_dst, generic_address_space, generic_address_space);
-    }
-
     // Modify Length to 1, if Mask is set to false
     // Used for isolating linear dimension from non-linear dimensions
     template <index_t... Lengths, index_t... Mask>

@@ -214,15 +217,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of src tensor
     // TODO: this function is not compiled to expected ISA
     template <typename SrcData,
-              typename DstData,
-              AddressSpace SrcAddressSpace,
-              AddressSpace DstAddressSpace>
-    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
-                                                          DstData* p_dst,
-                                                          integral_constant<AddressSpace, SrcAddressSpace>,
-                                                          integral_constant<AddressSpace, DstAddressSpace>) const
+              typename DstData>
+    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

@@ -317,6 +314,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the src vector has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0 // debug
                         static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                             *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =

@@ -332,6 +330,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                 *reinterpret_cast<const src_vector_t*>(
                                     &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
                         });
+#else
+                        move_data<SrcData,
+                                  SrcDataPerAccess,
+                                  SrcAddressSpace,
+                                  AddressSpace::vgpr,
+                                  InMemoryDataOperation::none>(
+                            p_src,
+                            src_nonlinear_coord.GetOffset() + src_linear_offset,
+                            p_src_long_vector,
+                            buffer_offset);
+#endif
                     }
                 }

@@ -361,8 +370,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the dst vector has the same padding situation
                     if(dst_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0 // debug
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                             *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+#else
+                        move_data<DstData,
+                                  DstDataPerAccess,
+                                  AddressSpace::vgpr,
+                                  DstAddressSpace,
+                                  DstInMemOp>(
+                            p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
+#endif
                     }
                 }
             });

@@ -376,15 +394,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of dst tensor
     // TODO: this function is not compiled to expected ISA
     template <typename SrcData,
-              typename DstData,
-              AddressSpace SrcAddressSpace,
-              AddressSpace DstAddressSpace>
-    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
-                                                          DstData* p_dst,
-                                                          integral_constant<AddressSpace, SrcAddressSpace>,
-                                                          integral_constant<AddressSpace, DstAddressSpace>) const
+              typename DstData>
+    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

@@ -470,8 +482,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the src vector has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0
                         *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
+#else
+                        move_data<SrcData,
+                                  SrcDataPerAccess,
+                                  SrcAddressSpace,
+                                  AddressSpace::vgpr,
+                                  InMemoryDataOperation::none>(
+                            p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
+#endif
                     }
                 }

@@ -510,6 +531,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the dst vector has the same padding situation
                     if(dst_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0
                         static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                             amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(

@@ -527,6 +549,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                 &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
                                 *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                         });
+#else
+                        move_data<DstData,
+                                  DstDataPerAccess,
+                                  AddressSpace::vgpr,
+                                  DstAddressSpace,
+                                  DstInMemOp>(
+                            p_dst_long_vector,
+                            buffer_offset,
+                            p_dst,
+                            dst_nonlinear_coord.GetOffset() + dst_linear_offset);
+#endif
                     }
                 }
             });
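The Run bodies above always stage data in a long vector and then move it one access at a time, SrcDataPerAccess or DstDataPerAccess elements per transaction. Below is a minimal host-side C++ sketch of that per-access granularity; std::memcpy stands in for the vector-typed loads/stores and for move_data.

// Sketch (plain C++, illustrative) of per-access copying: data moves in chunks
// of DataPerAccess elements, one move_data-style call per chunk.
#include <cstdio>
#include <cstring>

template <typename T, int DataPerAccess>
void copy_one_access(const T* p_src, int src_offset, T* p_dst, int dst_offset)
{
    // one vector-sized transaction; memcpy stands in for the
    // reinterpret_cast<vector_t*> accesses done in the real device code
    std::memcpy(p_dst + dst_offset, p_src + src_offset, sizeof(T) * DataPerAccess);
}

int main()
{
    constexpr int N = 8, DataPerAccess = 4;

    float src[N] = {0, 1, 2, 3, 4, 5, 6, 7};
    float dst[N] = {};

    for(int i = 0; i < N; i += DataPerAccess)
        copy_one_access<float, DataPerAccess>(src, i, dst, i);

    for(float v : dst)
        std::printf("%.0f ", v);
    std::printf("\n");
}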
composable_kernel/include/utility/common_header.hpp

@@ -15,6 +15,7 @@
 #include "functional2.hpp"
 #include "functional3.hpp"
 #include "functional4.hpp"
+#include "in_memory_operation.hpp"

 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
composable_kernel/include/utility/config.amd.hpp.in

@@ -54,7 +54,8 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global
+    global,
+    vgpr
 };

 #if CK_UNSIGNED_INDEX_TYPE
composable_kernel/include/utility/config.nvidia.hpp.in

@@ -33,7 +33,15 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global = generic
+    global,
+    lds,
+    vgpr
+};
+
+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
 };

 #if CK_UNSIGNED_INDEX_TYPE
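These enum values are carried around either as non-type template parameters or, in the older call sites, as integral_constant tag objects, so the choice is resolved at compile time. Below is a small sketch of the tag idiom using std::integral_constant; the project ships its own integral_constant and static_if, so the names here are only illustrative.

// Sketch (plain C++, illustrative) of integral_constant tag dispatch: the enum
// value lives in the tag's type, so overload resolution picks the path at
// compile time.
#include <cstdio>
#include <type_traits>

enum AddressSpace { generic, global, lds, vgpr };

void describe(std::integral_constant<AddressSpace, AddressSpace::global>)
{
    std::puts("global memory path");
}

void describe(std::integral_constant<AddressSpace, AddressSpace::lds>)
{
    std::puts("LDS (shared memory) path");
}

template <AddressSpace As>
void describe(std::integral_constant<AddressSpace, As>)
{
    std::puts("generic path");
}

int main()
{
    constexpr auto global_address_space = std::integral_constant<AddressSpace, AddressSpace::global>{};
    constexpr auto lds_address_space    = std::integral_constant<AddressSpace, AddressSpace::lds>{};
    constexpr auto vgpr_address_space   = std::integral_constant<AddressSpace, AddressSpace::vgpr>{};

    describe(global_address_space); // exact non-template overload wins
    describe(lds_address_space);
    describe(vgpr_address_space);   // falls back to the generic template
}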
composable_kernel/include/utility/float_type.amd.hpp.in

@@ -307,5 +307,58 @@ struct inner_product_with_conversion
     }
 };

+template <DataMovement Movement, AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
+__device__ void move_data(const float* p_src,
+                          index_t src_offset,
+                          float* p_dst,
+                          index_t dst_offset,
+                          integral_constant<DataMovement, Movement>,
+                          integral_constant<AddressSpace, SrcAddressSpace> src_address_space,
+                          integral_constant<AddressSpace, DstAddressSpace> dst_address_space)
+{
+    // TODO: use static_if::ElseIf
+    static_if<Movement == DataMovement::copy>{}([&](auto) {
+        copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
+    });
+
+    static_if<Movement == DataMovement::atomic_add>{}([&](auto) {
+        atomic_add_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
+    });
+}
+
+template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
+__device__ void copy_data(const float* p_src,
+                          index_t src_offset,
+                          float* p_dst,
+                          index_t dst_offset,
+                          integral_constant<AddressSpace, SrcAddressSpace>,
+                          integral_constant<AddressSpace, DstAddressSpace>)
+{
+    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
+        [&](auto fwd) {
+#if CK_USE_AMD_BUFFER_ADDRESSING
+            amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0);
+#else
+            p_dst[dst_offset] = p_src[src_offset];
+#endif
+        })
+        .Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; });
+}
+
+template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
+__device__ void atomic_add_data(const float* p_src,
+                                index_t src_offset,
+                                float* p_dst,
+                                index_t dst_offset,
+                                integral_constant<AddressSpace, SrcAddressSpace>,
+                                integral_constant<AddressSpace, DstAddressSpace>)
+{
+    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
+        [&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); })
+        .Else([&](auto fwd) {
+            static_assert(fwd(false), "atomic_add doesn't support this memory space");
+        });
+}
+
 } // namespace ck
 #endif
composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in (new file, 0 → 100644)
#ifndef CK_IN_MEMORY_OPERATION_NVIDIA_HPP
#define CK_IN_MEMORY_OPERATION_NVIDIA_HPP

namespace ck {

template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

    *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
        *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}

template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
        [&](auto) {
            atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
                      *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
        })
        .Else([&](auto fwd) {
            static_assert(fwd(false), "atomic_add doesn't support this memory space");
        });
}

template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    static_assert(DstInMemOp == InMemoryDataOperation::none ||
                      DstInMemOp == InMemoryDataOperation::atomic_add,
                  "wrong! InMemoryDataOperation not supported!");

    // TODO: use static_if::ElseIf
    static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
        copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });

    static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
        atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });
}

} // namespace ck
#endif
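A host-side sketch of how this move_data dispatch is meant to be used: the destination operation is a template parameter and the body statically selects the copy or accumulate path. In the sketch, if constexpr stands in for the project's static_if, and plain memory plus "+=" stands in for global memory and atomicAdd; the names are illustrative.

// Host-side sketch (illustrative, not the project's device code) of dispatch
// on InMemoryDataOperation.
#include <cstdio>

enum InMemoryDataOperation { none, atomic_add };

template <typename T, InMemoryDataOperation DstInMemOp>
void move_data_sketch(const T* p_src, int src_offset, T* p_dst, int dst_offset)
{
    if constexpr(DstInMemOp == InMemoryDataOperation::none)
        p_dst[dst_offset] = p_src[src_offset];  // copy_data path
    else
        p_dst[dst_offset] += p_src[src_offset]; // atomic_add_data path (atomicAdd on the GPU)
}

int main()
{
    float src[1] = {2.0f};
    float dst[1] = {1.0f};

    move_data_sketch<float, InMemoryDataOperation::none>(src, 0, dst, 0);
    std::printf("after copy:       %.1f\n", dst[0]); // 2.0

    move_data_sketch<float, InMemoryDataOperation::atomic_add>(src, 0, dst, 0);
    std::printf("after accumulate: %.1f\n", dst[0]); // 4.0
}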
driver/src/conv_driver.cpp

@@ -403,7 +403,7 @@ int main(int argc, char* argv[])
                                                           ConvStrides{},
                                                           ConvDilations{},
                                                           nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,