Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
0983d205
Commit
0983d205
authored
Apr 05, 2019
by
Chao Liu
Browse files
debugging
parent
bae23337
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
62 additions
and
109 deletions
+62
-109
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+1
-4
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-2
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
...dwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+17
-39
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+38
-43
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+4
-21
No files found.
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
View file @
0983d205
...
@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
...
@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
#elif
0
#elif
1
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
constexpr
index_t
BPerBlock
=
64
;
constexpr
index_t
BPerBlock
=
64
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
128
;
...
@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
...
@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadPerColumnPerCluster
=
8
;
constexpr
index_t
GemmThreadPerRowPerCluster
=
8
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
...
...
driver/driver.hip.cpp
View file @
0983d205
...
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
...
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
1
#elif
0
// 1x1 filter, 14x14 image, C = 2048
// 1x1 filter, 14x14 image, C = 2048
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
2048
;
constexpr
index_t
C
=
2048
;
...
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
...
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
0
#elif
1
// 1x1 filter, 14x14 image, C = 512
// 1x1 filter, 14x14 image, C = 512
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
C
=
512
;
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
View file @
0983d205
...
@@ -19,8 +19,6 @@ template <index_t GridSize,
...
@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t
CPerBlock
,
index_t
CPerBlock
,
index_t
BPerThread
,
index_t
BPerThread
,
index_t
KPerThread
,
index_t
KPerThread
,
index_t
GemmThreadPerColumnPerCluster
,
index_t
GemmThreadPerRowPerCluster
,
index_t
GemmMPerThreadSubC
,
index_t
GemmMPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmMLevel0Cluster
,
index_t
GemmMLevel0Cluster
,
...
@@ -95,25 +93,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
...
@@ -95,25 +93,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr
auto
out_kb_thread_desc
=
constexpr
auto
out_kb_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
BPerThread
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
BPerThread
>
{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
#if 0
...
@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
...
@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr
index_t
max_align
=
constexpr
index_t
max_align
=
mod_conv
::
max
(
index_t
(
4
),
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
);
mod_conv
::
max
(
index_t
(
4
),
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
);
constexpr
index_t
in_block_element_space
=
constexpr
index_t
in_block_space
=
in_cb_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
in_cb_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
constexpr
index_t
wei_block_
element_
space
=
constexpr
index_t
wei_block_space
=
wei_cyxk_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
wei_cyxk_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
__shared__
Float
p_in_block
[
in_block_
element_
space
];
__shared__
Float
p_in_block
[
in_block_space
];
__shared__
Float
p_wei_block
[
wei_block_
element_
space
];
__shared__
Float
p_wei_block
[
wei_block_space
];
const
Float
*
p_in_global_block_offset
=
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
...
@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
...
@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
__syncthreads
())
__syncthreads
())
{
{
// load data
// load data
#if
0
#if
1
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
#elif 0
#elif 0
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
0983d205
...
@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr
index_t
B
=
N
*
Hi
*
Wi
;
constexpr
index_t
B
=
N
*
Hi
*
Wi
;
constexpr
index_t
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
constexpr
index_t
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
static_assert
(
C
%
(
2
*
CPerBlock
)
==
0
,
"C cannot be evenly divided"
);
// divide block work by 2d: [K, B]
// divide block work by 2d: [K, B]
constexpr
index_t
KBlockWork
=
(
K
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
index_t
KBlockWork
=
(
K
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
index_t
BBlockWork
=
(
B
+
BPerBlock
-
1
)
/
BPerBlock
;
constexpr
index_t
BBlockWork
=
(
B
+
BPerBlock
-
1
)
/
BPerBlock
;
...
@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr
index_t
max_align
=
constexpr
index_t
max_align
=
mod_conv
::
max
(
index_t
(
4
),
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
);
mod_conv
::
max
(
index_t
(
4
),
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
);
constexpr
index_t
in_block_element_space
=
constexpr
index_t
in_block_space
=
in_cb_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
in_cb_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
constexpr
index_t
wei_block_
element_
space
=
constexpr
index_t
wei_block_space
=
wei_cyxk_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
wei_cyxk_block_desc
.
GetElementSpace
(
Number
<
max_align
>
{});
// LDS double buffer
// LDS double buffer
__shared__
Float
p_in_block_double
[
2
*
in_block_
element_
space
];
__shared__
Float
p_in_block_double
[
2
*
in_block_space
];
__shared__
Float
p_wei_block_double
[
2
*
wei_block_
element_
space
];
__shared__
Float
p_wei_block_double
[
2
*
wei_block_space
];
const
Float
*
p_in_global_block_offset
=
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
...
@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS
// preload data into LDS
{
{
#if
0
#if
1
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block_double
);
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block_double
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block_double
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block_double
);
#elif
1
#elif
0
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
...
@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
const
bool
even_loop
=
(
iloop
%
2
==
0
);
const
bool
even_loop
=
(
iloop
%
2
==
0
);
Float
*
p_in_block_now
=
Float
*
p_in_block_now
=
even_loop
?
p_in_block_double
:
p_in_block_double
+
in_block_
element_
space
;
even_loop
?
p_in_block_double
:
p_in_block_double
+
in_block_space
;
Float
*
p_wei_block_now
=
Float
*
p_wei_block_now
=
even_loop
?
p_wei_block_double
:
p_wei_block_double
+
wei_block_
element_
space
;
even_loop
?
p_wei_block_double
:
p_wei_block_double
+
wei_block_space
;
Float
*
p_in_block_next
=
Float
*
p_in_block_next
=
even_loop
?
p_in_block_double
+
in_block_
element_
space
:
p_in_block_double
;
even_loop
?
p_in_block_double
+
in_block_space
:
p_in_block_double
;
Float
*
p_wei_block_next
=
Float
*
p_wei_block_next
=
even_loop
?
p_wei_block_double
+
wei_block_element_space
:
p_wei_block_double
;
even_loop
?
p_wei_block_double
+
wei_block_space
:
p_wei_block_double
;
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
p_wei_global_block_offset
+=
CPerBlock
*
wei_cyxk_global_desc
.
GetStride
(
I0
);
// load next data
// load next data
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
p_wei_global_block_offset
+=
CPerBlock
*
wei_cyxk_global_desc
.
GetStride
(
I0
);
__syncthreads
();
__syncthreads
();
blockwise_in_copy
.
RunLoadRegisterClipboard
(
p_in_global_block_offset
,
blockwise_in_copy
.
RunLoadRegisterClipboard
(
p_in_global_block_offset
,
...
@@ -267,7 +268,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -267,7 +268,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
0
#if
1
blockwise_gemm
.
Run
blockwise_gemm
.
Run
#elif 0
#elif 0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
...
@@ -280,12 +281,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -280,12 +281,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
}
}
#if
0
#if
1
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
p_in_block_next
);
p_in_block_next
);
blockwise_wei_copy
.
RunStoreRegisterClipboard
(
p_wei_register_clipboard
,
blockwise_wei_copy
.
RunStoreRegisterClipboard
(
p_wei_register_clipboard
,
p_wei_block_next
);
p_wei_block_next
);
#elif
1
#elif
0
// if work with RunLoadRegisterClipboard_asm, need to wait
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt
(
0
);
vmcnt
(
0
);
...
@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
// tail
// tail
if
(
C
%
2
==
0
)
{
{
// even
// even
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
...
@@ -319,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -319,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
0
#if
1
blockwise_gemm
.
Run
blockwise_gemm
.
Run
#elif 0
#elif 0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
...
@@ -332,21 +332,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -332,21 +332,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
}
}
}
#if
0
#if
1
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
p_in_block_double + in_block_
element_
space);
p_in_block_double
+
in_block_space
);
blockwise_wei_copy.RunStoreRegisterClipboard(
blockwise_wei_copy
.
RunStoreRegisterClipboard
(
p_wei_register_clipboard
,
p_wei_register_clipboard,
p_wei_block_double + wei_block_
element_
space);
p_wei_block_double
+
wei_block_space
);
#else
#else
// if work with RunLoadRegisterClipboard_asm, need to wait
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt
(
0
);
vmcnt
(
0
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_register_clipboard
,
p_in_block_double
+
in_block_
element_
space
);
p_in_block_double
+
in_block_space
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
p_wei_register_clipboard
,
p_wei_block_double
+
wei_block_
element_
space
);
p_wei_block_double
+
wei_block_space
);
#endif
#endif
// odd
// odd
...
@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
...
@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
{
#if
0
#if
1
blockwise_gemm
.
Run
blockwise_gemm
.
Run
#elif 0
#elif 0
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif 1
#elif 1
blockwise_gemm
.
Run_asm
blockwise_gemm
.
Run_asm
#endif
#endif
(
p_wei_block_double
+
in_block_
element_
space
+
(
p_wei_block_double
+
in_block_space
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_double
+
wei_block_
element_
space
+
y
*
Wi
+
x
,
p_in_block_double
+
wei_block_space
+
y
*
Wi
+
x
,
p_out_thread
);
p_out_thread
);
}
}
}
}
}
}
else
{
// not implemented
assert
(
false
);
}
// output: register to global mem,
// output: register to global mem,
const
auto
c_thread_mtx_begin
=
const
auto
c_thread_mtx_begin
=
...
...
src/include/threadwise_gemm.hip.hpp
View file @
0983d205
...
@@ -13,22 +13,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
...
@@ -13,22 +13,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
constexpr
auto
dst_mtx
=
DstMatrix
{};
constexpr
auto
dst_mtx
=
DstMatrix
{};
for
(
index_t
i
=
0
;
i
<
NRow
;
++
i
)
for
(
index_t
i
=
0
;
i
<
NRow
;
++
i
)
{
// optimize for vector-4 load
if
(
NCol
%
4
==
0
)
{
using
vector_t
=
typename
vector_type
<
Float
,
4
>::
MemoryType
;
for
(
index_t
j
=
0
;
j
<
NCol
/
4
;
++
j
)
{
const
index_t
src_index
=
src_mtx
.
Get1dIndex
(
i
,
4
*
j
);
const
index_t
dst_index
=
dst_mtx
.
Get1dIndex
(
i
,
4
*
j
);
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_index
])
=
*
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_index
]);
}
}
else
{
{
for
(
index_t
j
=
0
;
j
<
NCol
;
++
j
)
for
(
index_t
j
=
0
;
j
<
NCol
;
++
j
)
{
{
...
@@ -38,7 +22,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
...
@@ -38,7 +22,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
p_dst
[
dst_index
]
=
p_src
[
src_index
];
p_dst
[
dst_index
]
=
p_src
[
src_index
];
}
}
}
}
}
}
}
template
<
class
MatrixA
,
template
<
class
MatrixA
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment