yangql / composable_kernel-1 / Commits

Commit dabfa77f, authored Apr 05, 2019 by Chao Liu
parent 605afd0f

    clipboard float4 copy and paste C++ code

Showing 6 changed files with 246 additions and 155 deletions.
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp                                  +1   -6
src/include/blockwise_2d_tensor_op.hip.hpp                                                    +126 -5
src/include/data_type.hip.hpp                                                                 +3   -9
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp    +65  -115
src/include/inline_asm.hpp                                                                    +30  -16
src/include/threadwise_gemm.hip.hpp                                                           +21  -4
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp

...
@@ -238,9 +238,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
     constexpr index_t GemmNLevel1Cluster = 4;
     constexpr index_t GemmKPerThreadLoop = 1;

-    constexpr index_t GemmThreadPerColumnPerCluster = 8;
-    constexpr index_t GemmThreadPerRowPerCluster    = 8;
-
     constexpr index_t InBlockCopyThreadPerDim0 = 4;
     constexpr index_t InBlockCopyThreadPerDim1 = 16;

...
@@ -272,7 +269,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
+#if 0
             GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 #else
             GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer

...
@@ -288,8 +285,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
                 CPerBlock,
                 BPerThread,
                 KPerThread,
-                GemmThreadPerColumnPerCluster,
-                GemmThreadPerRowPerCluster,
                 GemmMPerThreadSubC,
                 GemmNPerThreadSubC,
                 GemmMLevel0Cluster,

...
src/include/blockwise_2d_tensor_op.hip.hpp

 #pragma once
 #include "ConstantTensorDescriptor.hip.hpp"
+#include "inline_asm.hpp"

 template <index_t BlockSize, class Float, class DstDesc, class F>
 __device__ void

...
@@ -517,9 +518,9 @@ struct Blockwise2dTensorCopy3
     constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

     auto f_copy = [&](index_t iloop) {
-        *(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
-            *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
-                                                iloop * src_loop_stride));
+        *(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
+            *(reinterpret_cast<const vector_t*>(
+                &p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
     };

     for(index_t iloop = 0; iloop < nloop_d0; ++iloop)

...
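The rewrite of f_copy above is behavior-preserving: &p[off] and p + off name the same element, and the hard-coded 4 becomes the DataPerRead parameter (which the static_asserts later in this file pin to 4 on the asm paths). A trivial host-side check of the equivalence, with a hypothetical buffer:

    #include <cassert>

    int main()
    {
        constexpr int DataPerRead = 4; // matches the float4 path in this file
        float p_clipboard[16] = {};    // hypothetical clipboard buffer
        int iloop = 2;

        // &p[i * DataPerRead] is the same address as p + i * 4 when DataPerRead == 4
        assert(&p_clipboard[iloop * DataPerRead] == p_clipboard + iloop * 4);
    }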
@@ -568,8 +569,69 @@ struct Blockwise2dTensorCopy3
     constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

     auto f_copy = [&](index_t iloop) {
-        *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset +
-                                      iloop * dst_loop_stride)) =
-            *(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
+        *(reinterpret_cast<vector_t*>(
+            &p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) =
+            *(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]));
     };

     for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
     {
         f_copy(iloop);
     }

     constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);

     if(has_tail_d0)
     {
         constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;

         if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
         {
             f_copy(nloop_d0);
         }
     }
 }

+#if DEVICE_BACKEND_HIP
+__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
+                                             Float* p_clipboard) const
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    constexpr index_t L0 = CopyLengths{}.Get(I0);
+    constexpr index_t L1 = CopyLengths{}.Get(I1);
+
+    constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+    constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
+
+    constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
+
+    if(BlockSize > num_active_thread)
+    {
+        if(get_thread_local_1d_id() >= num_active_thread)
+        {
+            return;
+        }
+    }
+
+    constexpr index_t nloop_d0 = L0 / thread_per_d0;
+
+    constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+    constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+
+    auto f_copy = [&](index_t iloop) {
+#if 0
+        *(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
+            *(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
+                                                       iloop * src_loop_stride]));
+#else
+        static_assert(is_same<float, Float>::value && DataPerRead == 4,
+                      "global_load is only for float4");
+
+        global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),
+                    reinterpret_cast<const vector_t*>(
+                        &p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
+#endif
+    };
+
+    for(index_t iloop = 0; iloop < nloop_d0; ++iloop)

...
@@ -589,4 +651,63 @@ struct Blockwise2dTensorCopy3
         }
     }
 }

+__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
+                                              Float* __restrict__ p_dst) const
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    constexpr index_t L0 = CopyLengths{}.Get(I0);
+    constexpr index_t L1 = CopyLengths{}.Get(I1);
+
+    constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+    constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
+
+    constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
+
+    if(BlockSize > num_active_thread)
+    {
+        if(get_thread_local_1d_id() >= num_active_thread)
+        {
+            return;
+        }
+    }
+
+    constexpr index_t nloop_d0 = L0 / thread_per_d0;
+
+    constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+    constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+
+    auto f_copy = [&](index_t iloop) {
+#if 0
+        *(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) =
+            *(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]));
+#else
+        static_assert(is_same<float, Float>::value && DataPerRead == 4,
+                      "ds_write_b128 is only for float4");
+
+        ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
+                      &p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]);
+#endif
+    };
+
+    for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
+    {
+        f_copy(iloop);
+    }
+
+    constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
+
+    if(has_tail_d0)
+    {
+        constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;
+
+        if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
+        {
+            f_copy(nloop_d0);
+        }
+    }
+}
+#endif
 };
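The thread decomposition shared by both _asm methods can be sanity-checked on the host. A minimal sketch with assumed sizes (BlockSize = 128 copying an 11 x 256 float tile with DataPerRead = 4 — hypothetical numbers, not from this commit):

    #include <cstdio>

    int main()
    {
        // assumed for illustration only
        constexpr int BlockSize   = 128;
        constexpr int L0          = 11;  // rows of the tile
        constexpr int L1          = 256; // floats per row
        constexpr int DataPerRead = 4;   // one float4 per read

        constexpr int thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead; // 64 vector reads cover a row
        constexpr int thread_per_d0 = BlockSize / thread_per_d1;            // 2 rows copied per iteration
        constexpr int nloop_d0      = L0 / thread_per_d0;                   // 5 full iterations
        constexpr int tail_d0       = L0 - nloop_d0 * thread_per_d0;        // 1 leftover row

        // only threads with id < tail_d0 * thread_per_d1 (here: 64) run f_copy(nloop_d0)
        printf("d1 %d, d0 %d, loops %d, tail %d\n",
               thread_per_d1, thread_per_d0, nloop_d0, tail_d0);
    }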
src/include/data_type.hip.hpp

...
@@ -9,13 +9,13 @@ struct vector_type
 template <>
 struct vector_type<float, 1>
 {
-    using MemoryType = float;
+    typedef float MemoryType;
 };

 template <>
 struct vector_type<float, 2>
 {
-    using MemoryType = float2;
+    typedef float MemoryType __attribute__((ext_vector_type(2)));

     __host__ __device__ static MemoryType Pack(float s0, float s1)
     {

...
@@ -34,13 +34,7 @@ struct vector_type<float, 2>
 template <>
 struct vector_type<float, 4>
 {
-    using MemoryType = float4;
-};
-
-template <>
-struct vector_type<float2, 2>
-{
-    using MemoryType = float4;
+    typedef float MemoryType __attribute__((ext_vector_type(4)));
 };

 #if 0

...
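The move from the HIP float2/float4 typedefs to Clang's ext_vector_type attribute gives vector_type a first-class vector MemoryType that the compiler can load and store as a single 64- or 128-bit access, which is what the dwordx4/b128 paths elsewhere in this commit rely on. A standalone sketch of the same pattern (plain Clang, no HIP needed):

    #include <cstdio>

    // same idea as vector_type<float, 4>::MemoryType above
    typedef float float4_t __attribute__((ext_vector_type(4)));

    int main()
    {
        static_assert(sizeof(float4_t) == 4 * sizeof(float), "packed 128-bit vector");

        float4_t v = {1.0f, 2.0f, 3.0f, 4.0f};
        v = v + v; // element-wise arithmetic comes with the extension

        printf("%f %f %f %f\n", v.x, v.y, v.z, v.w); // OpenCL-style accessors
    }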
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp

...
@@ -19,8 +19,6 @@ template <index_t GridSize,
           index_t CPerBlock,
           index_t BPerThread,
           index_t KPerThread,
-          index_t GemmThreadPerColumnPerCluster,
-          index_t GemmThreadPerRowPerCluster,
           index_t GemmMPerThreadSubC,
           index_t GemmNPerThreadSubC,
           index_t GemmMLevel0Cluster,

...
@@ -98,25 +96,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
     constexpr auto out_kb_thread_desc =
         make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});

-#if 0
-    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-    {
-        print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
-        print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
-        print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
-        print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
-        print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
-        print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
-        print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
-        print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
-        print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
-
-        printf("KPerBlock %u\n", KPerBlock);
-    }
-#endif
-
     // blockwise in copy
     //   format is [CPerBlock, BPerBlock + BGhostRead]
 #if 0

...
@@ -188,19 +167,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
     constexpr auto c_kxb_thread_mtx_desc =
         make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});

-#if 0
-    const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
-                                                                 decltype(a_cxk_block_mtx_desc),
-                                                                 decltype(b_cxb_block_mtx_desc),
-                                                                 decltype(c_kxb_thread_mtx_desc),
-                                                                 true,
-                                                                 false,
-                                                                 false,
-                                                                 GemmKPerThreadLoop,
-                                                                 GemmThreadPerColumnPerCluster,
-                                                                 GemmThreadPerRowPerCluster,
-                                                                 true>{};
-#else
     const auto blockwise_gemm =
         BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
                                                                 decltype(a_cxk_block_mtx_desc),

...
@@ -213,7 +179,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                                                                 GemmMLevel1Cluster,
                                                                 GemmNLevel1Cluster,
                                                                 GemmKPerThreadLoop>{};
-#endif

     // LDS: be careful of alignment
     constexpr index_t max_align =

...
@@ -235,27 +200,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
     const Float* p_wei_global_block_offset =
         p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);

     // preload data into LDS
-#if 0
-    blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0);
-    blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0);
-#else
-    Float4 tmp_in, tmp_wei;
-
-    Float4* glb_in_p  = (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
-    Float4* glb_wei_p = (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
-
-    global_load(tmp_in, glb_in_p);
-    global_load(tmp_wei, glb_wei_p);
-
-    Float4* loc_in_p  = (Float4*)(p_in_block_double + blockwise_in_copy.mDstMyThreadOffset);
-    Float4* loc_wei_p = (Float4*)(p_wei_block_double + blockwise_wei_copy.mDstMyThreadOffset);
-
-    vmcnt(0);
-    ds_write_b128(tmp_in, loc_in_p);
-    ds_write_b128(tmp_wei, loc_wei_p);
-#endif
+    blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
+    blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);

     // register
     Float p_out_thread[out_kb_thread_desc.GetElementSpace()];

...
@@ -285,7 +232,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);

         // load next data
-#if 0
+#if 1
         Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
         Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

...
@@ -296,7 +243,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                     p_wei_register_clipboard);
-#elif 1
+#elif 0
         Float4 tmp_in, tmp_wei;

         Float4* glb_in_p =
             (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);

...
@@ -328,11 +275,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             }
         }

-#if 0
-        blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
+#if 1
+        blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
-#elif 1
+#elif 0
+        // if working with RunLoadRegisterClipboard_asm, we need to wait here
+        vmcnt(0);
+
+        blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_next);
+        blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, p_wei_block_next);
+#elif 0
         Float4* loc_in_p = (Float4*)(p_in_block_next + blockwise_in_copy.mDstMyThreadOffset);
         Float4* loc_wei_p =

...
@@ -352,16 +308,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
         p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);

-        Float4 tmp_in, tmp_wei;
-
-        Float4* glb_in_p =
-            (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
-        Float4* glb_wei_p =
-            (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
-
         __syncthreads();

-        global_load(tmp_in, glb_in_p);
-        global_load(tmp_wei, glb_wei_p);
+        Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
+        Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+
+        blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
+                                                   p_in_register_clipboard);
+        blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
+                                                    p_wei_register_clipboard);

         for(index_t y = 0; y < Y; ++y)
         {

...
@@ -369,10 +325,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
 #if 0
                 blockwise_gemm.Run
-#elif 1
-                blockwise_gemm.Run_asm
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
+#elif 1
+                blockwise_gemm.Run_asm
 #endif
                     (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                      p_in_block_double + y * Wi + x,

...
@@ -380,14 +336,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             }
         }

         Float4* loc_in_p  = (Float4*)(p_in_block_double + in_block_element_space +
                                       blockwise_in_copy.mDstMyThreadOffset);
         Float4* loc_wei_p = (Float4*)(p_wei_block_double + wei_block_element_space +
                                       blockwise_wei_copy.mDstMyThreadOffset);

 #if 1
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                     p_in_block_double + in_block_element_space);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                      p_wei_block_double + wei_block_element_space);
 #else
         // if working with RunLoadRegisterClipboard_asm, we need to wait here
         vmcnt(0);
         ds_write_b128(tmp_in, loc_in_p);
         ds_write_b128(tmp_wei, loc_wei_p);

         blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                         p_in_block_double + in_block_element_space);
         blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
                                                          p_wei_block_double + wei_block_element_space);
 #endif

         // odd
         __syncthreads();

...
@@ -398,10 +362,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
 #if 0
                 blockwise_gemm.Run
-#elif 1
-                blockwise_gemm.Run_asm
 #elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
+#elif 1
+                blockwise_gemm.Run_asm
 #endif
                     (p_wei_block_double + in_block_element_space +
                          wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),

...
@@ -423,20 +387,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
         const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;

-#if 0
-        if(get_block_1d_id() == 0)
-        {
-            printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
-                   get_block_1d_id(),
-                   get_thread_local_1d_id(),
-                   matrix_c_index.row,
-                   matrix_c_index.col,
-                   k_data_begin,
-                   b_data_begin,
-                   p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
-        }
-#endif
-
         for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
         {
             for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)

...
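Everything in this file hangs off one pattern: while the blockwise GEMM consumes one half of the doubled LDS buffer, the next C-slice is staged into the other half at offset in_block_element_space / wei_block_element_space, with __syncthreads() separating the two phases. A host-side sketch of just the ping-pong indexing (hypothetical tile size, not from the commit):

    #include <cstdio>

    int main()
    {
        constexpr int tile_elements = 8;   // stands in for in_block_element_space
        float lds[2 * tile_elements] = {}; // stands in for p_in_block_double

        for(int tile = 0; tile < 4; ++tile)
        {
            // even iterations compute from the front half, odd from the back half
            float* compute_buf = lds + (tile % 2) * tile_elements;
            float* preload_buf = lds + ((tile + 1) % 2) * tile_elements;

            // on the GPU, __syncthreads() sits between filling preload_buf
            // and reading it as compute_buf on the next iteration
            printf("tile %d: compute at %td, preload at %td\n",
                   tile, compute_buf - lds, preload_buf - lds);
        }
    }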
src/include/inline_asm.hpp

...
@@ -10,7 +10,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
 #define NO_DS_WRITE 0
 #define NO_GLB_READ 0

-inline __device__ void vmcnt(int cnt)
+inline __device__ void vmcnt(index_t cnt)
 {
 #if !NO_VM_WAIT
     if(cnt == 0)

...
@@ -39,12 +39,12 @@ inline __device__ void vmcnt(int cnt)
     }
     else
     {
-        assert(0);
+        assert(false);
     }
 #endif
 }

-inline __device__ void lgkmcnt(int cnt)
+inline __device__ void lgkmcnt(index_t cnt)
 {
 #if !NO_LGKM_WAIT
     if(cnt == 0)

...
@@ -79,7 +79,7 @@ inline __device__ void lgkmcnt(int cnt)
     }
     else
     {
-        assert(0);
+        assert(false);
     }
 #endif
 }

...
@@ -187,7 +187,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c)
     outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
 }

-inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0)
+inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_READ
     if(offset == 0)

...
@@ -408,29 +408,43 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0)
     }
     else
     {
-        assert(0);
+        assert(false);
     }
 #endif
 }

-inline __device__ void global_load(Float4& r, Float4* ptr)
+inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
 {
 #if !NO_GLB_READ
+    if(offset == 0)
+    {
         asm volatile("\n \
         global_load_dwordx4 %0, %1, off \n \
         "
                      : "=v"(r)
                      : "v"(ptr));
+    }
+    else
+    {
+        assert(false);
+    }
 #endif
 }

-inline __device__ void ds_write_b128(Float4& r, void* lds, int offset = 0)
+inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
     {
         asm volatile("\n \
         ds_write_b128 %0, %1 \n \
         "
                      :
                      : "v"(__to_local(lds)), "v"(r));
     }
     else
     {
         assert(false);
     }
 #endif
 }
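Taken together, these helpers implement the global-to-LDS staging used by the gridwise kernel above: read a float4 into registers with global_load, drain the read with vmcnt(0), then push the value to LDS with ds_write_b128. A sketch of that sequence as a free function (hypothetical name; only meaningful on an AMD GCN target where these asm blocks assemble):

    // hypothetical helper mirroring the preload sequence in the gridwise kernel
    inline __device__ void stage_float4_global_to_lds(const Float4* p_glb, Float4* p_lds)
    {
        Float4 tmp;

        global_load(tmp, p_glb);   // global_load_dwordx4: global memory -> VGPRs
        vmcnt(0);                  // s_waitcnt vmcnt(0): wait for the load to land
        ds_write_b128(tmp, p_lds); // 128-bit store: VGPRs -> LDS
    }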
src/include/threadwise_gemm.hip.hpp

...
@@ -13,6 +13,22 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     constexpr auto dst_mtx = DstMatrix{};

     for(index_t i = 0; i < NRow; ++i)
     {
+        // optimize for vector-4 load
+        if(NCol % 4 == 0)
+        {
+            using vector_t = typename vector_type<Float, 4>::MemoryType;
+
+            for(index_t j = 0; j < NCol / 4; ++j)
+            {
+                const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
+                const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
+
+                *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+            }
+        }
+        else
+        {
+            for(index_t j = 0; j < NCol; ++j)
+            {

...
@@ -22,6 +38,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
                 p_dst[dst_index] = p_src[src_index];
+            }
         }
     }
 }

 template <class MatrixA,

...
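The new fast path in threadwise_matrix_copy vectorizes a row copy whenever NCol is a multiple of 4. The same dispatch can be exercised on the host; a minimal standalone sketch (Clang, assuming a hypothetical row-major layout with stride == NCol and 16-byte-aligned pointers):

    #include <cassert>
    #include <cstring>

    typedef float vector_t __attribute__((ext_vector_type(4))); // as in data_type.hip.hpp

    template <int NRow, int NCol>
    void matrix_copy(const float* p_src, float* p_dst)
    {
        for(int i = 0; i < NRow; ++i)
        {
            if(NCol % 4 == 0) // vector-4 fast path, as in the diff above
            {
                for(int j = 0; j < NCol / 4; ++j)
                {
                    const int idx = i * NCol + 4 * j;
                    *reinterpret_cast<vector_t*>(&p_dst[idx]) =
                        *reinterpret_cast<const vector_t*>(&p_src[idx]);
                }
            }
            else // scalar fallback
            {
                for(int j = 0; j < NCol; ++j)
                    p_dst[i * NCol + j] = p_src[i * NCol + j];
            }
        }
    }

    int main()
    {
        alignas(16) float src[8 * 8], dst[8 * 8] = {};
        for(int i = 0; i < 64; ++i)
            src[i] = float(i);

        matrix_copy<8, 8>(src, dst);
        assert(std::memcmp(src, dst, sizeof(src)) == 0);
    }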