Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
5b7a18c5
"web/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "a3d7bb0a30b9ea07d3d4beff41e7f35c34f7f48a"
Commit
5b7a18c5
authored
Sep 18, 2019
by
Chao Liu
Browse files
experimenting global and buffer load/store
parent
c7a6545e
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
14 additions
and
14 deletions
+14
-14
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+2
-2
composable_kernel/include/utility/amd_inline_asm.hpp
composable_kernel/include/utility/amd_inline_asm.hpp
+8
-8
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+1
-1
driver/src/driver.cpp
driver/src/driver.cpp
+3
-3
No files found.
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
5b7a18c5
...
@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code
#if 1 // source code
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_normal_offset
+
src_merged_offset
]);
&
p_src
[
src_normal_offset
+
src_merged_offset
]);
#elif
1
// inline asm using global_load
#elif
0
// inline asm using global_load
vector_data
=
__global_load
<
TData
,
SrcDataPerAccess
>
(
vector_data
=
__global_load
<
TData
,
SrcDataPerAccess
>
(
p_src
,
p_src
,
static_cast
<
uint32_t
>
(
src_merged_offset
),
static_cast
<
uint32_t
>
(
src_merged_offset
),
...
@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
...
@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code
#if 1 // source code
*
reinterpret_cast
<
dst_vector_t
*>
(
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_normal_offset
+
dst_merged_offset
])
=
vector_data
;
&
p_dst
[
dst_normal_offset
+
dst_merged_offset
])
=
vector_data
;
#elif
1
// inline asm using global_store
#elif
0
// inline asm using global_store
__global_store
<
TData
,
DstDataPerAccess
>
(
__global_store
<
TData
,
DstDataPerAccess
>
(
vector_data
,
p_dst
,
dst_merged_offset
,
dst_normal_offset
);
vector_data
,
p_dst
,
dst_merged_offset
,
dst_normal_offset
);
#elif 1 // inline asm using buffer_store
#elif 1 // inline asm using buffer_store
...
...
composable_kernel/include/utility/amd_inline_asm.hpp
View file @
5b7a18c5
...
@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
...
@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
__
buffer_load_dword %0, %1, %2, %3 offen offset:0
\n
\
buffer_load_dword %0, %1, %2, %3 offen offset:0
\n
\
s_waitcnt 0
\n
\
;;
s_waitcnt 0
\n
\
"
"
:
"=v"
(
dst
)
:
"=v"
(
dst
)
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
...
@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
...
@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
__
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0
\n
\
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0
\n
\
s_waitcnt 0
\n
\
;;
s_waitcnt 0
\n
\
"
"
:
"=v"
(
dst
)
:
"=v"
(
dst
)
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
...
@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
...
@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
reinterpret_cast
<
int
*>
(
&
src_block_setting
)[
3
]
=
0x00027000
;
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
__
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0
\n
\
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0
\n
\
s_waitcnt 0
\n
\
;;
s_waitcnt 0
\n
\
"
"
:
"=v"
(
dst
)
:
"=v"
(
dst
)
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
:
"v"
(
src_thread_offset
),
"s"
(
src_block_setting
),
"s"
(
src_const_offset
));
...
@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src,
...
@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src,
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
3
]
=
0x00027000
;
reinterpret_cast
<
int
*>
(
&
dst_block_setting
)[
3
]
=
0x00027000
;
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
__
buffer_store_dword %1, %2, %0, %3 offen offset:0
\n
\
buffer_store_dword %1, %2, %0, %3 offen offset:0
\n
\
s_waitcnt 0
\n
\
;;
s_waitcnt 0
\n
\
"
"
:
:
:
"s"
(
dst_block_setting
),
"v"
(
src
),
"v"
(
dst_thread_offset
),
"s"
(
dst_const_offset
));
:
"s"
(
dst_block_setting
),
"v"
(
src
),
"v"
(
dst_thread_offset
),
"s"
(
dst_const_offset
));
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
5b7a18c5
...
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
0
#if
1
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
driver/src/driver.cpp
View file @
5b7a18c5
...
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
...
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
1
#elif
0
// 1x1 filter, 8x8 image
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr
index_t
N
=
64
;
constexpr
index_t
N
=
64
;
...
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
...
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
0
#elif
1
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
...
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
...
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
#elif
1
#elif
0
// 1x7 filter, 0x3 pad, 17x17 input
// 1x7 filter, 0x3 pad, 17x17 input
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
C
=
128
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment