Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
b6e1c52a
"...resnet50_tensorflow.git" did not exist on "eeb00c02591d2afab2a758bf0670a8dd52398102"
Commit
b6e1c52a
authored
Sep 19, 2019
by
Chao Liu
Browse files
use buffer_load buffer_store intrinsic
parent
8afbb10d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
207 additions
and
4 deletions
+207
-4
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+2
-2
composable_kernel/include/utility/amd_inline_asm.hpp
composable_kernel/include/utility/amd_inline_asm.hpp
+204
-1
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+1
-1
No files found.
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
b6e1c52a
...
@@ -835,7 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -835,7 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// 2. src_normal_offset must be calculatd at compile time (guaranteed)
// 2. src_normal_offset must be calculatd at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
static_if
<
SrcMemorySpace
==
2
>
{}([
&
](
auto
)
{
static_if
<
SrcMemorySpace
==
2
>
{}([
&
](
auto
)
{
#if
1
// source code
#if
0
// source code
vector_data = *reinterpret_cast<const src_vector_t*>(
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
&p_src[src_normal_offset + src_merged_offset]);
#elif
0
// inline asm using global_load
#elif
0
// inline asm using global_load
...
@@ -940,7 +940,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -940,7 +940,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// 2. dst_normal_offset must be calculatd at compile time (guaranteed)
// 2. dst_normal_offset must be calculatd at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
static_if
<
DstMemorySpace
==
2
>
{}([
&
](
auto
)
{
static_if
<
DstMemorySpace
==
2
>
{}([
&
](
auto
)
{
#if
1
// source code
#if
0
// source code
*reinterpret_cast<dst_vector_t*>(
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#elif
0
// inline asm using global_store
#elif
0
// inline asm using global_store
...
...
composable_kernel/include/utility/amd_inline_asm.hpp
View file @
b6e1c52a
...
@@ -8,6 +8,47 @@ namespace ck {
...
@@ -8,6 +8,47 @@ namespace ck {
// cast a pointer of LDS to its address
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
__device__
void
*
__to_local
(
void
*
p
);
extern
"C"
__attribute__
((
address_space
(
3
)))
__device__
void
*
__to_local
(
void
*
p
);
__device__
float
__llvm_amdgcn_buffer_load
(
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load"
);
__device__
vector_type
<
float
,
2
>::
MemoryType
__llvm_amdgcn_buffer_loadx2
(
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.dwordx2"
);
__device__
vector_type
<
float
,
4
>::
MemoryType
__llvm_amdgcn_buffer_loadx4
(
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.dwordx4"
);
__device__
void
__llvm_amdgcn_buffer_store
(
float
vdata
,
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.store"
);
__device__
void
__llvm_amdgcn_buffer_storex2
(
vector_type
<
float
,
2
>::
MemoryType
vdata
,
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.store.dwordx2"
);
__device__
void
__llvm_amdgcn_buffer_storex4
(
vector_type
<
float
,
4
>::
MemoryType
vdata
,
int32x4_t
rsrc
,
uint32_t
vindex
,
uint32_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.store.dwordx4"
);
// global_load and global_store
// global_load and global_store
template
<
typename
T
,
index_t
VectorSize
>
template
<
typename
T
,
index_t
VectorSize
>
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
__global_load
(
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
__global_load
(
...
@@ -186,7 +227,7 @@ __device__ void __global_store<float, 1>(const float& src,
...
@@ -186,7 +227,7 @@ __device__ void __global_store<float, 1>(const float& src,
#endif
#endif
}
}
//
__
buffer_load and
__
buffer_store
// buffer_load and buffer_store
template
<
typename
T
,
index_t
VectorSize
>
template
<
typename
T
,
index_t
VectorSize
>
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
__buffer_load
(
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
__buffer_load
(
const
T
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
);
const
T
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
);
...
@@ -202,6 +243,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
...
@@ -202,6 +243,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
uint32_t
src_thread_data_offset
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
)
uint32_t
src_const_data_offset
)
{
{
#if 0
float dst;
float dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
...
@@ -223,12 +265,32 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
...
@@ -223,12 +265,32 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
return dst;
#else
float
dst
;
uint32_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
uint32_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
int32x4_t
src_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
src_block_setting
)
=
const_cast
<
float
*>
(
p_src_block
);
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
dst
=
__llvm_amdgcn_buffer_load
(
src_block_setting
,
0
,
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
dst
;
#endif
}
}
template
<
>
template
<
>
__device__
vector_type
<
float
,
2
>::
MemoryType
__buffer_load
<
float
,
2
>
(
__device__
vector_type
<
float
,
2
>::
MemoryType
__buffer_load
<
float
,
2
>
(
const
float
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
)
const
float
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
)
{
{
#if 0
vector_type<float, 2>::MemoryType dst;
vector_type<float, 2>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
...
@@ -250,12 +312,32 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
...
@@ -250,12 +312,32 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
return dst;
#else
vector_type
<
float
,
2
>::
MemoryType
dst
;
uint32_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
uint32_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
int32x4_t
src_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
src_block_setting
)
=
const_cast
<
float
*>
(
p_src_block
);
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
dst
=
__llvm_amdgcn_buffer_loadx2
(
src_block_setting
,
0
,
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
dst
;
#endif
}
}
template
<
>
template
<
>
__device__
vector_type
<
float
,
4
>::
MemoryType
__buffer_load
<
float
,
4
>
(
__device__
vector_type
<
float
,
4
>::
MemoryType
__buffer_load
<
float
,
4
>
(
const
float
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
)
const
float
*
p_src_block
,
uint32_t
src_thread_data_offset
,
uint32_t
src_const_data_offset
)
{
{
#if 0
vector_type<float, 4>::MemoryType dst;
vector_type<float, 4>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
...
@@ -277,6 +359,25 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
...
@@ -277,6 +359,25 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
return dst;
#elif
1
vector_type
<
float
,
4
>::
MemoryType
dst
;
uint32_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
uint32_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
int32x4_t
src_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
src_block_setting
)
=
const_cast
<
float
*>
(
p_src_block
);
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
dst
=
__llvm_amdgcn_buffer_loadx4
(
src_block_setting
,
0
,
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
dst
;
#endif
}
}
template
<
>
template
<
>
...
@@ -285,6 +386,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
...
@@ -285,6 +386,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
uint32_t
dst_thread_data_offset
,
uint32_t
dst_thread_data_offset
,
uint32_t
dst_const_data_offset
)
uint32_t
dst_const_data_offset
)
{
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
...
@@ -304,6 +406,107 @@ __device__ void __buffer_store<float, 1>(const float& src,
...
@@ -304,6 +406,107 @@ __device__ void __buffer_store<float, 1>(const float& src,
"v"(src),
"v"(src),
"v"(dst_thread_addr_offset),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
"s"(dst_const_addr_offset));
#else
uint32_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
uint32_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
int32x4_t
dst_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
dst_block_setting
)
=
p_dst_block
;
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
3
]
=
0x00027000
;
__llvm_amdgcn_buffer_store
(
src
,
dst_block_setting
,
0
,
dst_thread_addr_offset
+
dst_const_addr_offset
,
false
,
false
);
#endif
}
template
<
>
__device__
void
__buffer_store
<
float
,
2
>
(
const
vector_type
<
float
,
2
>::
MemoryType
&
src
,
float
*
p_dst_block
,
uint32_t
dst_thread_data_offset
,
uint32_t
dst_const_data_offset
)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
uint32_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
int32x4_t
dst_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
dst_block_setting
)
=
p_dst_block
;
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
3
]
=
0x00027000
;
__llvm_amdgcn_buffer_storex2
(
src
,
dst_block_setting
,
0
,
dst_thread_addr_offset
+
dst_const_addr_offset
,
false
,
false
);
#endif
}
template
<
>
__device__
void
__buffer_store
<
float
,
4
>
(
const
vector_type
<
float
,
4
>::
MemoryType
&
src
,
float
*
p_dst_block
,
uint32_t
dst_thread_data_offset
,
uint32_t
dst_const_data_offset
)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
uint32_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
int32x4_t
dst_block_setting
{
0
};
// fill in byte 0 - 1
*
reinterpret_cast
<
float
**>
(
&
dst_block_setting
)
=
p_dst_block
;
// fill in byte 2
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
2
]
=
-
1
;
// fill in byte 3
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
3
]
=
0x00027000
;
__llvm_amdgcn_buffer_storex4
(
src
,
dst_block_setting
,
0
,
dst_thread_addr_offset
+
dst_const_addr_offset
,
false
,
false
);
#endif
}
}
__device__
void
vmcnt
(
index_t
cnt
)
__device__
void
vmcnt
(
index_t
cnt
)
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
b6e1c52a
...
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
0
#if
1
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment