Commit c7a6545e
Authored Sep 18, 2019 by Chao Liu
Parent: 9f46cdf5

    experimenting global and buffer load/store

Showing 5 changed files with 223 additions and 80 deletions.
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp   +5   -8
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +42  -44
composable_kernel/include/utility/amd_inline_asm.hpp                                 +171 -23
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp              +3   -3
driver/src/driver.cpp                                                                +2   -2
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
...
@@ -487,6 +487,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
 #else
+        // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
 #endif
     }
...
@@ -497,7 +498,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+        // hardcoded: register to LDS
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
 #endif
     }
...
@@ -506,13 +508,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];

-#if 0
-        RunLoadRegisterBuffer(p_src, p_buffer);
-        RunStoreRegisterBuffer(p_buffer, p_dst);
-#else
-        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
-#endif
+        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseStore.Run(p_buffer, p_dst);
     }

     template <typename T, bool PositiveDirection>
...
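The template arguments of Run_amd_experiment appear to encode the source and destination memory spaces: the inline comments above label <TData, 2, 0> as "global to register" and <TData, 0, 1> as "register to LDS", and the threadwise copy in the next file dispatches its global path on SrcMemorySpace == 2. A minimal sketch of that reading follows; the enum names and the helper function are hypothetical, not part of this commit.

    // Hypothetical illustration only: memory-space codes inferred from the
    // comments in this commit ("global to register", "register to LDS").
    enum MemorySpaceCode
    {
        RegisterCode = 0, // thread-private register buffer
        LdsCode      = 1, // LDS / shared memory
        GlobalCode   = 2  // global memory
    };

    // Sketch of the hardcoded blockwise copy path above, spelled out by name.
    template <typename TData, typename ThreadwiseLoad, typename ThreadwiseStore>
    __device__ void run_blockwise_copy_sketch(const TData* p_src, TData* p_dst,
                                              ThreadwiseLoad& load, ThreadwiseStore& store)
    {
        TData p_buffer[64]; // stand-in for GetRegisterBufferSize()

        load.template Run_amd_experiment<TData, GlobalCode, RegisterCode>(p_src, p_buffer); // global -> register
        store.template Run_amd_experiment<TData, RegisterCode, LdsCode>(p_buffer, p_dst);   // register -> LDS
    }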
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...
@@ -819,38 +819,38 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             src_vector_t vector_data;

+            // Read vector from src.
+            //   1. Source code version can take src of all kinds of memory-space
+            //   2. Inline asm versions using global_load or buffer_load can only take
+            //      src from global-memory
+            //
+            // Comment for loading from global-memory:
+            //   When
+            //     1) using source code, in order for compiler to emit optimal
+            //        load instruction, or
+            //     2) using inline asm (global_load or buffer_load), in order
+            //        for inline asm to be valid,
+            //   following assumptions need to be satisfied:
+            //     1. p_src need to be block-invariant (assumption)
+            //     2. src_normal_offset must be calculated at compile time (guaranteed)
+            //     3. src_merged_offset can be runtime value (no assumption imposed)
             static_if<SrcMemorySpace == 2>{}([&](auto) {
 #if 1 // source code
-                // Load vector from src.
-                // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculated at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
-#else // inline asm using buffer_load
-                // Load vector from src
-                // src's memory-space can only be global-memory (buffer_load inline-asm is
-                // used)
-                // In order for buffer_load to be valid, need to assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculated at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
-                vector_data = buffer_load<TData, SrcDataPerAccess>(
-                    p_src,
-                    static_cast<uint32_t>(src_merged_offset),
-                    static_cast<uint32_t>(src_normal_offset));
+#elif 1 // inline asm using global_load
+                vector_data = __global_load<TData, SrcDataPerAccess>(
+                    p_src,
+                    static_cast<uint32_t>(src_merged_offset),
+                    static_cast<uint32_t>(src_normal_offset));
+#elif 1 // inline asm using buffer_load
+                vector_data = __buffer_load<TData, SrcDataPerAccess>(
+                    p_src,
+                    static_cast<uint32_t>(src_merged_offset),
+                    static_cast<uint32_t>(src_normal_offset));
 #endif
             }).Else([&](auto) {
-                // Load vector from src.
                 // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculated at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
             });
...
@@ -924,36 +924,34 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             const index_t dst_normal_offset =
                 DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

+            // Write vector into dst.
+            //   1. Source code version can take dst of all kinds of memory-space
+            //   2. Inline asm versions using global_store or buffer_store can only take
+            //      dst from global-memory
+            //
+            // Comment for storing into global-memory:
+            //   When
+            //     1) using source code, in order for compiler to emit optimal
+            //        store instruction, or
+            //     2) using inline asm (global_store or buffer_store), in order
+            //        for inline asm to be valid,
+            //   following assumptions need to be satisfied:
+            //     1. p_dst need to be block-invariant (assumption)
+            //     2. dst_normal_offset must be calculated at compile time (guaranteed)
+            //     3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if<DstMemorySpace == 2>{}([&](auto) {
 #if 1 // source code
-                // Write vector into dst.
-                // dst can be all kinds of memory-space
-                // In order for optimized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculated at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#else // inline asm using buffer_store
-                // Write vector into dst.
-                // dst's memory-space need to be global-memory (buffer_store is used)
-                // In order for optimized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculated at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
-                buffer_store<TData, DstDataPerAccess>(
-                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#elif 1 // inline asm using global_store
+                __global_store<TData, DstDataPerAccess>(
+                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#elif 1 // inline asm using buffer_store
+                __buffer_store<TData, DstDataPerAccess>(
+                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
 #endif
             }).Else([&](auto) {
-                // Write vector into dst.
                 // dst can be all kinds of memory-space
-                // In order for optimized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculated at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
             });
...
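The hoisted comments spell out the contract shared by the plain C++ path and the inline-asm helpers: the base pointer must be block-invariant, the "normal" offset must be a compile-time value, and only the "merged" offset may vary at runtime. A minimal caller sketch under those assumptions; the function is hypothetical and assumes it lives inside namespace ck, where the helpers below are declared.

    // Hypothetical caller, for illustration only: splitting the source address into
    // the compile-time "normal" part and the runtime per-thread "merged" part that
    // __global_load / __buffer_load expect (declared in amd_inline_asm.hpp below).
    template <typename TData, index_t SrcDataPerAccess>
    __device__ typename vector_type<TData, SrcDataPerAccess>::MemoryType
    load_sketch(const TData* p_src) // p_src assumed block-invariant (same value in every thread of the block)
    {
        constexpr index_t src_normal_offset = 128;                            // known at compile time
        const index_t src_merged_offset     = threadIdx.x * SrcDataPerAccess; // runtime, per-thread

        return __global_load<TData, SrcDataPerAccess>(p_src,
                                                      static_cast<uint32_t>(src_merged_offset),
                                                      static_cast<uint32_t>(src_normal_offset));
    }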
composable_kernel/include/utility/amd_inline_asm.hpp
...
@@ -8,21 +8,169 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

-// buffer_load and buffer_store
+// global_load and global_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
-buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);

 template <typename T, index_t VectorSize>
-__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
-                             T* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset);
+__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);

 template <>
-__device__ float buffer_load<float, 1>(const float* p_src_block,
-                                       uint32_t src_thread_offset,
-                                       uint32_t src_const_offset)
+__device__ float __global_load<float, 1>(const float* p_src_block,
+                                         uint32_t src_thread_offset,
+                                         uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else
+    // compute on SALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else
+    // compute on SALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else
+    // compute on SALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ void __global_store<float, 1>(const float& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_offset,
+                                         uint32_t dst_const_offset)
+{
+#if 0 // compute on VALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
+#else
+    // compute on SALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+
+    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
+#endif
+}
+
+// __buffer_load and __buffer_store
+template <typename T, index_t VectorSize>
+__device__ typename vector_type<T, VectorSize>::MemoryType
+__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+template <typename T, index_t VectorSize>
+__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
+
+template <>
+__device__ float __buffer_load<float, 1>(const float* p_src_block,
+                                         uint32_t src_thread_offset,
+                                         uint32_t src_const_offset)
 {
     float dst;
...
@@ -35,7 +183,7 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

     asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
...
@@ -45,9 +193,9 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
 }

 template <>
-__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
+__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
                                                                      uint32_t src_thread_offset,
                                                                      uint32_t src_const_offset)
 {
     vector_type<float, 2>::MemoryType dst;
...
@@ -60,7 +208,7 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

     asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
...
@@ -70,9 +218,9 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
 }

 template <>
-__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
+__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
                                                                      uint32_t src_thread_offset,
                                                                      uint32_t src_const_offset)
 {
     vector_type<float, 4>::MemoryType dst;
...
@@ -85,7 +233,7 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

     asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                 : "=v"(dst)
...
@@ -95,10 +243,10 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
 }

 template <>
-__device__ void buffer_store<float, 1>(const float& src,
+__device__ void __buffer_store<float, 1>(const float& src,
                                          float* p_dst_block,
                                          uint32_t dst_thread_offset,
                                          uint32_t dst_const_offset)
 {
     int32x4_t dst_block_setting{0};

     // fill in byte 0 - 1
...
@@ -109,7 +257,7 @@ __device__ void buffer_store<float, 1>(const float& src,
     reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

     asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                 :
...
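Both helper families added in this file share a (base pointer, per-thread offset, constant offset) interface, which is what lets the threadwise copy above switch between them with a preprocessor branch: the global_* variants use plain 64-bit global addressing (global_load_dword / global_store_dword with an SGPR base), while the buffer_* variants build a 128-bit buffer resource descriptor inside the helper (dword 3 set to 0x00027000) and issue buffer_load/store with a 32-bit offen offset. A usage sketch, illustrative only and not part of this commit:

    // Illustrative only: round-tripping one float through each helper family.
    // Offset values here are arbitrary example choices.
    __device__ void copy_one_float_sketch(const float* p_src_block, float* p_dst_block)
    {
        const uint32_t thread_offset = threadIdx.x; // runtime, per-thread part
        const uint32_t const_offset  = 0;           // compile-time part

        // global_* path: SGPR base pointer + per-thread offset
        float v = ck::__global_load<float, 1>(p_src_block, thread_offset, const_offset);
        ck::__global_store<float, 1>(v, p_dst_block, thread_offset, const_offset);

        // buffer_* path: same interface, but the helper goes through a buffer
        // resource descriptor and an offen offset
        float w = ck::__buffer_load<float, 1>(p_src_block, thread_offset, const_offset);
        ck::__buffer_store<float, 1>(w, p_dst_block, thread_offset, const_offset);
    }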
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
...
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
...
@@ -82,7 +82,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
     // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
...
@@ -156,7 +156,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
...
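WeiBlockCopySrcDataPerRead_E controls how many elements each thread reads per access from the weight tensor, and widths of 1, 2, or 4 floats are exactly what the __global_load specializations added in amd_inline_asm.hpp cover (global_load_dword / dwordx2 / dwordx4). A sketch of that correspondence, with hypothetical names; whether this tuning parameter actually reaches those helpers depends on the copy implementation and is an assumption here.

    // Sketch only: mapping a per-read width of 4 onto the float4 helper added above.
    // Whether WeiBlockCopySrcDataPerRead_E feeds SrcDataPerAccess is an assumption.
    constexpr ck::index_t WeiBlockCopySrcDataPerRead_E_sketch = 4;

    using wei_vector_t =
        ck::vector_type<float, WeiBlockCopySrcDataPerRead_E_sketch>::MemoryType; // float4

    __device__ wei_vector_t load_weight_vector_sketch(const float* p_wei_block, uint32_t thread_offset)
    {
        // VectorSize = 4 selects __global_load<float, 4>, whose inline-asm variant
        // issues a single global_load_dwordx4 per thread.
        return ck::__global_load<float, WeiBlockCopySrcDataPerRead_E_sketch>(p_wei_block, thread_offset, 0u);
    }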
driver/src/driver.cpp
...
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;

-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
...
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;

-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
...