Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
5245a016
"git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "8c5dd99825fe464e0f73a008a9bc79caadc88014"
Commit
5245a016
authored
Apr 06, 2019
by
Chao Liu
Browse files
clean up
parent
7a251a09
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
69 additions
and
68 deletions
+69
-68
driver/driver.hip.cpp
driver/driver.hip.cpp
+1
-1
src/include/amd_inline_asm.hip.hpp
src/include/amd_inline_asm.hip.hpp
+25
-14
src/include/blockwise_gemm.hip.hpp
src/include/blockwise_gemm.hip.hpp
+10
-0
src/include/common.hip.hpp
src/include/common.hip.hpp
+2
-7
src/include/data_type.hip.hpp
src/include/data_type.hip.hpp
+8
-4
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
...dwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+3
-3
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+20
-39
No files found.
driver/driver.hip.cpp
View file @
5245a016
...
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
...
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
1
#elif
0
// 1x1 filter, 14x14 image, C = 512
// 1x1 filter, 14x14 image, C = 512
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
C
=
512
;
...
...
src/include/inline_asm.hpp
→
src/include/
amd_
inline_asm.
hip.
hpp
View file @
5245a016
#pragma once
#pragma once
#include "common.hip.hpp"
typedef
float
Float4
__attribute__
((
ext_vector_type
(
4
)));
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
#define NO_VM_WAIT 0
#define NO_VM_WAIT 0
#define NO_LGKM_WAIT 0
#define NO_LGKM_WAIT 0
...
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
...
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
#define NO_DS_WRITE 0
#define NO_DS_WRITE 0
#define NO_GLB_READ 0
#define NO_GLB_READ 0
inline
__device__
void
vmcnt
(
index_t
cnt
)
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
__device__
void
vmcnt
(
index_t
cnt
)
{
{
#if !NO_VM_WAIT
#if !NO_VM_WAIT
if
(
cnt
==
0
)
if
(
cnt
==
0
)
...
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
...
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
#endif
#endif
}
}
inline
__device__
void
lgkmcnt
(
index_t
cnt
)
__device__
void
lgkmcnt
(
index_t
cnt
)
{
{
#if !NO_LGKM_WAIT
#if !NO_LGKM_WAIT
if
(
cnt
==
0
)
if
(
cnt
==
0
)
...
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
...
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
#endif
#endif
}
}
inline
__device__
void
outerProduct1x4
(
const
float
*
a
,
const
float
*
b
,
float
*
c
)
__device__
void
outerProduct1x4
(
const
float
*
a
,
const
float
*
b
,
float
*
c
)
{
{
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_mac_f32 %0, %4, %5
\n
\
v_mac_f32 %0, %4, %5
\n
\
...
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
...
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
"3"
(
c
[
3
]));
"3"
(
c
[
3
]));
}
}
inline
__device__
void
outerProduct1x4
(
const
float
&
a
,
const
Float4
&
b
,
Float4
&
c
)
__device__
void
outerProduct1x4
(
const
float
&
a
,
const
vector_type
<
float
,
4
>::
MemoryType
&
b
,
vector_type
<
float
,
4
>::
MemoryType
&
c
)
{
{
#if 0
#if 0
asm volatile(
asm volatile(
...
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
...
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
#endif
#endif
}
}
inline
__device__
void
__device__
void
outerProduct4x4
(
const
vector_type
<
float
,
4
>::
MemoryType
&
a
,
outerProduct4x4
(
const
Float4
&
a
,
const
Float4
&
b
,
Float4
&
c0
,
Float4
&
c1
,
Float4
&
c2
,
Float4
&
c3
)
const
vector_type
<
float
,
4
>::
MemoryType
&
b
,
vector_type
<
float
,
4
>::
MemoryType
&
c0
,
vector_type
<
float
,
4
>::
MemoryType
&
c1
,
vector_type
<
float
,
4
>::
MemoryType
&
c2
,
vector_type
<
float
,
4
>::
MemoryType
&
c3
)
{
{
#if 0
#if 0
asm volatile(
asm volatile(
...
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
...
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
#endif
#endif
}
}
inline
__device__
void
outerProduct8x8
(
const
Float4
*
a
,
const
Float4
*
b
,
Float4
*
c
)
__device__
void
outerProduct8x8
(
const
vector_type
<
float
,
4
>::
MemoryType
*
a
,
const
vector_type
<
float
,
4
>::
MemoryType
*
b
,
vector_type
<
float
,
4
>::
MemoryType
*
c
)
{
{
outerProduct4x4
(
a
[
0
],
b
[
0
],
c
[
0
],
c
[
2
],
c
[
4
],
c
[
6
]);
outerProduct4x4
(
a
[
0
],
b
[
0
],
c
[
0
],
c
[
2
],
c
[
4
],
c
[
6
]);
outerProduct4x4
(
a
[
0
],
b
[
1
],
c
[
1
],
c
[
3
],
c
[
5
],
c
[
7
]);
outerProduct4x4
(
a
[
0
],
b
[
1
],
c
[
1
],
c
[
3
],
c
[
5
],
c
[
7
]);
...
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
...
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
outerProduct4x4
(
a
[
1
],
b
[
1
],
c
[
9
],
c
[
11
],
c
[
13
],
c
[
15
]);
outerProduct4x4
(
a
[
1
],
b
[
1
],
c
[
9
],
c
[
11
],
c
[
13
],
c
[
15
]);
}
}
inline
__device__
void
ds_read_b128
(
Float4
&
r
,
void
*
lds
,
index_t
offset
=
0
)
__device__
void
ds_read_b128
(
vector_type
<
float
,
4
>::
MemoryType
&
r
,
void
*
lds
,
index_t
offset
=
0
)
{
{
#if !NO_DS_READ
#if !NO_DS_READ
if
(
offset
==
0
)
if
(
offset
==
0
)
...
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
...
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
#endif
#endif
}
}
inline
__device__
void
global_load
(
Float4
&
r
,
const
Float4
*
ptr
,
index_t
offset
=
0
)
__device__
void
global_load
(
vector_type
<
float
,
4
>::
MemoryType
&
r
,
const
vector_type
<
float
,
4
>::
MemoryType
*
ptr
,
index_t
offset
=
0
)
{
{
#if !NO_GLB_READ
#if !NO_GLB_READ
if
(
offset
==
0
)
if
(
offset
==
0
)
...
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
...
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
#endif
#endif
}
}
inline
__device__
void
ds_write_b128
(
const
Float4
&
r
,
void
*
lds
,
index_t
offset
=
0
)
__device__
void
ds_write_b128
(
const
vector_type
<
float
,
4
>::
MemoryType
&
r
,
void
*
lds
,
index_t
offset
=
0
)
{
{
#if !NO_DS_WRITE
#if !NO_DS_WRITE
if
(
offset
==
0
)
if
(
offset
==
0
)
...
...
src/include/blockwise_gemm.hip.hpp
View file @
5245a016
...
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
const
FloatB
*
__restrict__
p_b_block
,
const
FloatB
*
__restrict__
p_b_block
,
FloatC
*
__restrict__
p_c_thread
)
const
FloatC
*
__restrict__
p_c_thread
)
const
{
{
static_assert
(
is_same
<
FloatA
,
float
>::
value
&&
is_same
<
FloatB
,
float
>::
value
&&
is_same
<
FloatC
,
float
>::
value
,
"Run_asm only deal with float
\n
"
);
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
...
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr
auto
b_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
constexpr
auto
b_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThreadSubC
>
{},
Number
<
NPerThread
>
{});
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThreadSubC
>
{},
Number
<
NPerThread
>
{});
static_assert
(
MPerThreadSubC
==
4
&&
NPerThreadSubC
==
4
&&
KPerThreadLoop
==
1
&&
MPerThread
==
8
&&
NPerThread
==
8
,
"Run_asm cannot deal with this GEMM shape yet
\n
"
);
using
Float4
=
vector_type
<
float
,
4
>::
MemoryType
;
float
p_thread
[
a_thread_mtx
.
GetElementSpace
()
+
b_thread_mtx
.
GetElementSpace
()];
float
p_thread
[
a_thread_mtx
.
GetElementSpace
()
+
b_thread_mtx
.
GetElementSpace
()];
FloatA
*
p_a_thread
=
p_thread
;
FloatA
*
p_a_thread
=
p_thread
;
...
...
src/include/common.hip.hpp
View file @
5245a016
...
@@ -5,8 +5,8 @@
...
@@ -5,8 +5,8 @@
#include "Array.hip.hpp"
#include "Array.hip.hpp"
#include "functional.hip.hpp"
#include "functional.hip.hpp"
#if DEVICE_BACKE
D
N_HIP
#if DEVICE_BACKEN
D
_HIP
#include "inline_asm.hpp"
#include "
amd_
inline_asm.
hip.
hpp"
#endif
#endif
__device__
index_t
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
__device__
index_t
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
...
@@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
...
@@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
return
x
<
y
?
x
:
y
;
return
x
<
y
?
x
:
y
;
}
}
}
// namespace mod_conv
}
// namespace mod_conv
#if DEVICE_BACKEND_HIP
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
#endif
src/include/data_type.hip.hpp
View file @
5245a016
...
@@ -16,12 +16,14 @@ template <>
...
@@ -16,12 +16,14 @@ template <>
struct
vector_type
<
float
,
2
>
struct
vector_type
<
float
,
2
>
{
{
#if DEVICE_BACKEND_HIP
#if DEVICE_BACKEND_HIP
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef
float
MemoryType
__attribute__
((
ext_vector_type
(
2
)));
typedef
float
MemoryType
__attribute__
((
ext_vector_type
(
2
)));
#elif DEVICE_BACKEND_CUDA
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
// kernel would produce wrong result, indicating the compiler fail to generate correct
// instruction,
using
MemoryType
=
float2
;
using
MemoryType
=
float2
;
#endif
#endif
...
@@ -43,12 +45,14 @@ template <>
...
@@ -43,12 +45,14 @@ template <>
struct
vector_type
<
float
,
4
>
struct
vector_type
<
float
,
4
>
{
{
#if DEVICE_BACKEND_HIP
#if DEVICE_BACKEND_HIP
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef
float
MemoryType
__attribute__
((
ext_vector_type
(
4
)));
typedef
float
MemoryType
__attribute__
((
ext_vector_type
(
4
)));
#elif DEVICE_BACKEND_CUDA
#elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
// kernel would produce wrong result, indicating the compiler fail to generate correct
// instruction,
using
MemoryType
=
float4
;
using
MemoryType
=
float4
;
#endif
#endif
};
};
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
View file @
5245a016
...
@@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
...
@@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#elif 0
#elif 0
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
y
*
Wi
+
x
,
p_in_block
+
y
*
Wi
+
x
,
p_out_thread
);
p_out_thread
);
}
}
}
}
}
}
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
5245a016
...
@@ -34,11 +34,6 @@ template <index_t GridSize,
...
@@ -34,11 +34,6 @@ template <index_t GridSize,
index_t
WeiBlockCopyDataPerRead
>
index_t
WeiBlockCopyDataPerRead
>
struct
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
struct
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
__host__
__device__
constexpr
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
()
{
}
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
const
Float
*
const
__restrict__
p_out_global
)
const
...
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS
// preload data into LDS
{
{
#if 1
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
...
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
p_wei_register_clipboard
);
p_wei_register_clipboard
);
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
#if 0
p_in_block_double
);
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double
);
p_wei_block_double);
#elif 0
#elif
1
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
blockwise_in_copy
.
RunLoadRegisterClipboard
(
p_in_global_block_offset
,
p_in_register_clipboard
);
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
p_wei_register_clipboard
);
vmcnt
(
0
);
vmcnt
(
0
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_double
);
p_in_block_double
);
...
@@ -278,28 +264,26 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -278,28 +264,26 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif 1
#elif 1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_now
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_now
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_now
+
y
*
Wi
+
x
,
p_in_block_now
+
y
*
Wi
+
x
,
p_out_thread
);
p_out_thread
);
}
}
}
}
#if
1
#if
0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
p_wei_block_next);
#elif 0
#elif
1
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt
(
0
);
vmcnt
(
0
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_next
);
p_in_block_next
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
...
@@ -329,32 +313,29 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -329,32 +313,29 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif 1
#elif 1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_double
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_double
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_double
+
y
*
Wi
+
x
,
p_in_block_double
+
y
*
Wi
+
x
,
p_out_thread
);
p_out_thread
);
}
}
}
}
#if
1
#if
0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_space);
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
p_wei_block_double + wei_block_space);
#else
#else
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt
(
0
);
vmcnt
(
0
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block_double
+
in_block_space
);
p_in_block_double
+
in_block_space
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
p_wei_block_double
+
wei_block_space
);
p_wei_block_double
+
wei_block_space
);
#endif
#endif
...
@@ -366,17 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -366,17 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
1
#if
0
blockwise_gemm.Run
blockwise_gemm.Run
#elif
0
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif 1
#elif 1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_double
+
wei_block_space
+
(
p_wei_block_double
+
wei_block_space
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_double
+
in_block_space
+
y
*
Wi
+
x
,
p_in_block_double
+
in_block_space
+
y
*
Wi
+
x
,
p_out_thread
);
p_out_thread
);
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment