gaoqiong / composable_kernel · Commits · e43d7bc6

Commit e43d7bc6, authored Apr 01, 2019 by Chao Liu
Parent: d058d164

refactor
Showing 13 changed files with 862 additions and 917 deletions (+862, -917)
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp  +9 -7
driver/driver.hip.cpp  +1 -1
src/include/ConstantTensorDescriptor.hip.hpp  +4 -1
src/include/blockwise_gemm.hip.hpp  +363 -485
src/include/common.hip.hpp  +34 -12
src/include/gridwise_direct_convolution_1.hip.hpp  +5 -5
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp  +5 -4
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp  +4 -4
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp  +5 -4
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp  +4 -4
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp  +221 -172
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp  +204 -196
src/include/threadwise_gemm.hip.hpp  +3 -22
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp (+9 -7)

@@ -270,7 +270,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
     ...
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
+        constexpr auto gridwise_conv =
 #if 1
             gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
 #else
     ...

@@ -301,12 +301,14 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
     ...
             WeiBlockCopyThreadPerDim0,
             WeiBlockCopyThreadPerDim1,
             InBlockCopyDataPerRead,
-            WeiBlockCopyDataPerRead>,
-            dim3(GridSize),
-            dim3(BlockSize),
-            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+            WeiBlockCopyDataPerRead>();
+
+        float time = launch_kernel(gridwise_conv.Run,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);

         usleep(std::min(time * 1000, float(10000)));
     ...
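The refactor above stops passing the kernel function template straight to launch_kernel; instead it instantiates a gridwise operator object and hands its Run entry point to the launcher. A minimal host-only sketch of that shape, with hypothetical GridwiseOpSketch / launch_sketch names standing in for the project's types:

    #include <cstdio>

    // Hypothetical stand-in for a gridwise operator: the configuration lives
    // in the type, the entry point is a static Run member.
    template <int BlockSize, int GridSize>
    struct GridwiseOpSketch
    {
        static void Run(const float* in, float* out, int n)
        {
            for(int i = 0; i < n; ++i)
                out[i] = in[i] * 2.0f; // placeholder body
        }
    };

    // Hypothetical launcher: takes any callable entry point plus its arguments.
    template <class F, class... Args>
    float launch_sketch(F f, Args... args)
    {
        f(args...);
        return 0.0f; // a real launcher would time the kernel and return milliseconds
    }

    int main()
    {
        constexpr auto op = GridwiseOpSketch<256, 64>{};
        float in[4] = {1, 2, 3, 4}, out[4] = {};
        float time = launch_sketch(op.Run, in, out, 4);
        std::printf("Elapsed time : %f ms\n", time);
    }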
driver/driver.hip.cpp (+1 -1)

@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
     ...
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image, C = 2048
     constexpr index_t N = 128;
     constexpr index_t C = 2048;
     ...
src/include/ConstantTensorDescriptor.hip.hpp (+4 -1)

@@ -137,7 +137,10 @@ struct ConstantTensorDescriptor
     ...
            }
        };

-        return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
+        index_t element_space_unaligned =
+            static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + 1;
+
+        return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
    }

    template <class... Is>
     ...
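The new GetElementSpace computes the raw element count and then rounds it up to a multiple of align.Get() with the usual integer trick align * ((x + align - 1) / align). A standalone sketch of that round-up, with a hypothetical helper name:

    #include <cstddef>

    // Round x up to the nearest multiple of align (align > 0), the same
    // integer arithmetic used in the new GetElementSpace above.
    constexpr std::size_t round_up_to_multiple(std::size_t x, std::size_t align)
    {
        return align * ((x + align - 1) / align);
    }

    static_assert(round_up_to_multiple(13, 4) == 16, "13 rounds up to 16");
    static_assert(round_up_to_multiple(16, 4) == 16, "already aligned stays put");
    static_assert(round_up_to_multiple(1, 8) == 8, "small counts still get a full unit");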
src/include/blockwise_gemm.hip.hpp (+363 -485)

This diff is collapsed.
src/include/common.hip.hpp (+34 -12)

@@ -5,8 +5,6 @@
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"

-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
-
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }

 __device__ index_t get_block_1d_id() { return blockIdx.x; }
 ...

@@ -23,21 +21,45 @@ struct is_same<T, T>
     static const bool value = true;
 };

-#if DEVICE_BACKEND_CUDA
-template <typename T>
-__host__ __device__ constexpr T max(T a, T b)
-{
-    return a > b ? a : b;
-}
-
-template <typename T>
-__host__ __device__ constexpr T min(T a, T b)
-{
-    return a < b ? a : b;
-}
-#endif
-
 __host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
 {
     return (a + b - 1) / b;
 }
+
+namespace mod_conv {
+
+template <class T>
+__host__ __device__ constexpr T max(T x, T y)
+{
+    return x > y ? x : y;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T max(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = max(xs...);
+
+    static_assert(is_same<decltype(y), T>::value, "not the same type");
+
+    return x > y ? x : y;
+}
+
+template <class T>
+__host__ __device__ constexpr T min(T x, T y)
+{
+    return x < y ? x : y;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T min(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = min(xs...);
+
+    static_assert(is_same<decltype(y), T>::value, "not the same type");
+
+    return x < y ? x : y;
+}
+
+}
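The new mod_conv::max and mod_conv::min reduce any number of same-typed arguments by peeling one argument off and recursing on the rest, with static_asserts guarding the arity and the type. A host-only sketch of the same pattern (C++14 or later; the sketch namespace and asserts below are illustrative, not the project's):

    #include <type_traits>

    namespace sketch {

    template <class T>
    constexpr T max(T x, T y)
    {
        return x > y ? x : y;
    }

    // Variadic overload: peel off the first argument, recurse on the rest,
    // and insist that every argument shares the type T.
    template <class T, class... Ts>
    constexpr T max(T x, Ts... xs)
    {
        static_assert(sizeof...(xs) > 0, "need at least two arguments");
        auto y = max(xs...);
        static_assert(std::is_same<decltype(y), T>::value, "arguments must share one type");
        return x > y ? x : y;
    }

    } // namespace sketch

    static_assert(sketch::max(2, 1) == 2, "two-way max");
    static_assert(sketch::max(3, 7, 5) == 7, "three-way max folds through the pack");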
src/include/gridwise_direct_convolution_1.hip.hpp (+5 -5)

@@ -59,12 +59,12 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     ...
     constexpr auto out_block_desc =
         make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());

-    constexpr index_t in_block_size = in_block_desc.GetElementSpace();
-    constexpr index_t wei_block_size = wei_block_desc.GetElementSpace();
+    constexpr index_t in_block_element_size = in_block_desc.GetElementSpace();
+    constexpr index_t wei_block_element_size = wei_block_desc.GetElementSpace();
     constexpr index_t out_block_size = out_block_desc.GetElementSpace();

-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_element_size];
+    __shared__ Float p_wei_block[wei_block_element_size];
     __shared__ Float p_out_block[out_block_size];

     const index_t block_id = blockIdx.x;
     ...
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp (+5 -4)

@@ -63,17 +63,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
     ...
         Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

-    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+    __shared__ Float p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // threadwise tensors
     constexpr index_t HiPerThread = HoPerThread + Y - 1;
     ...
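Both shared-memory arrays above are padded so their element counts are multiples of max_align, the larger of the two per-read vector widths. A host-side sketch of that sizing arithmetic, using made-up read widths in place of the kernel's InBlockCopyDataPerRead / WeiBlockCopyDataPerRead template parameters:

    #include <cstddef>

    // Illustrative read widths (in elements); the kernel receives these as
    // template parameters instead of fixed constants.
    constexpr std::size_t in_data_per_read  = 4;
    constexpr std::size_t wei_data_per_read = 2;

    // Pick the wider of the two reads as the alignment unit.
    constexpr std::size_t max_align =
        in_data_per_read > wei_data_per_read ? in_data_per_read : wei_data_per_read;

    // Pad a raw element count up to a multiple of max_align, as done for the
    // p_in_block / p_wei_block shared-memory arrays.
    constexpr std::size_t padded_size(std::size_t element_size)
    {
        return max_align * ((element_size + max_align - 1) / max_align);
    }

    static_assert(max_align == 4, "wider read wins");
    static_assert(padded_size(30) == 32, "30 elements padded to 32");
    static_assert(padded_size(32) == 32, "an exact multiple stays unchanged");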
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp (+4 -4)

@@ -73,10 +73,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
     ...
         Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
     ...

@@ -84,9 +84,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
     ...
                                       : WeiBlockCopyDataPerRead;

     __shared__ in_vector_mem_t
-        p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
+        p_in_vec_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
     __shared__ in_vector_mem_t
-        p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+        p_wei_vec_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // threadwise tensors
     constexpr index_t HiPerThread = HoPerThread + Y - 1;
     ...
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp (+5 -4)

@@ -164,18 +164,19 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
     ...
         HoPerThread>{};

     // LDS: be careful of alignment
-    constexpr index_t in_block_size =
+    constexpr index_t in_block_element_size =
         in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
-    constexpr index_t wei_block_size =
+    constexpr index_t wei_block_element_size =
         wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

-    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)];
+    __shared__ Float p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)];

     // register
     Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
     ...
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp (+4 -4)

@@ -204,11 +204,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
     ...
         true>{};

     // LDS
-    constexpr index_t in_block_size = in_chwn_block_desc.GetElementSpace();
-    constexpr index_t wei_block_size = wei_cyxk_block_desc.GetElementSpace();
+    constexpr index_t in_block_element_size = in_chwn_block_desc.GetElementSpace();
+    constexpr index_t wei_block_element_size = wei_cyxk_block_desc.GetElementSpace();

-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_element_size];
+    __shared__ Float p_wei_block[wei_block_element_size];

     // register
     Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
     ...
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp (+221 -172)

This diff is collapsed.

src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+204 -196)

This diff is collapsed.
src/include/threadwise_gemm.hip.hpp (+3 -22)

@@ -10,11 +10,9 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     ...
     constexpr auto src_mtx = SrcMatrix{};
     constexpr auto dst_mtx = DstMatrix{};

-#if 1
-    //NRow = 1
+#if 0
     for(index_t i = 0; i < NRow; ++i)
     {
-        //NCol = 4
         for(index_t j = 0; j < NCol; ++j)
         {
             const index_t src_index = src_mtx.Get1dIndex(i, j);
     ...

@@ -23,7 +21,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     ...
             p_dst[dst_index] = p_src[src_index];
         }
     }
-#elif 0
+#elif 1
     static_assert(NCol == 4, "only for NCol == 4");

     using vector_t = typename vector_type<Float, 4>::MemoryType;
     ...

@@ -33,22 +31,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     ...
         const index_t src_index = src_mtx.Get1dIndex(i, 0);
         const index_t dst_index = dst_mtx.Get1dIndex(i, 0);

-#if 0
         *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
             *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
-#elif 0
-        asm volatile("\n \
-            ds_read2_b64 %0, %1 offset1:1 \n \
-            s_waitcnt lgkmcnt(0)"
-                     : "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-                     : "v"(__to_local((void*)(&p_src[src_index]))));
-#elif 1
-        asm volatile("\n \
-            ds_read_b128 %0, %1 \n \
-            s_waitcnt lgkmcnt(0)"
-                     : "=v"(*(reinterpret_cast<vector_t*>(&p_dst[dst_index])))
-                     : "v"(__to_local((void*)(&p_src[src_index]))));
-#endif
     }
 #endif
 }
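After this change the #elif 1 branch of threadwise_matrix_copy is the active one: each row moves as a single 4-wide vector_t instead of four scalar copies, and the ds_read inline-asm variants (together with the __to_local declaration they required in common.hip.hpp) are gone. A host-only sketch of the 4-wide row copy, using memcpy rather than the kernel's reinterpret_cast so the example stays well-defined standard C++:

    #include <cstring>
    #include <cstdio>

    // Illustrative 4-wide vector type; the kernel uses vector_type<Float, 4>::MemoryType.
    struct float4_sketch
    {
        float data[4];
    };

    // Copy nrow rows of 4 floats, one vector per row, from src to dst.
    // The row strides are element strides between consecutive rows.
    void copy_rows_vectorized(const float* src, float* dst,
                              int nrow, int src_row_stride, int dst_row_stride)
    {
        for(int i = 0; i < nrow; ++i)
        {
            float4_sketch v;
            std::memcpy(&v, src + i * src_row_stride, sizeof(v)); // one 16-byte load
            std::memcpy(dst + i * dst_row_stride, &v, sizeof(v)); // one 16-byte store
        }
    }

    int main()
    {
        float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        float dst[8] = {};
        copy_rows_vectorized(src, dst, /*nrow=*/2, /*src_row_stride=*/4, /*dst_row_stride=*/4);
        std::printf("%g %g %g %g | %g %g %g %g\n",
                    dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], dst[6], dst[7]);
    }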
@@ -84,13 +68,10 @@ __device__ void threadwise_gemm(MatrixA,
     ...
     constexpr index_t N = c_mtx.NCol();
     constexpr index_t K = a_mtx.NRow(); // A is transposed

-    // K = 1
     for(index_t k = 0; k < K; ++k)
     {
-        // M = 8
         for(index_t i = 0; i < M; ++i)
         {
-            // N = 8
             for(index_t j = 0; j < N; ++j)
             {
                 const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
     ...
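The loop nest in threadwise_gemm walks k, then i, then j, and indexes A as (k, i), i.e. A is stored transposed with K rows and M columns. The accumulation statement itself is outside the visible hunk, so the following is a plain-C++ reference sketch of the product that this indexing implies, with small fixed sizes:

    #include <cstdio>

    // Reference for the threadwise GEMM loop order above: A is stored transposed
    // as K x M, B as K x N, and C (M x N) accumulates sum over k of A(k, i) * B(k, j).
    template <int M, int N, int K>
    void threadwise_gemm_reference(const float (&a)[K][M],
                                   const float (&b)[K][N],
                                   float (&c)[M][N])
    {
        for(int k = 0; k < K; ++k)
            for(int i = 0; i < M; ++i)
                for(int j = 0; j < N; ++j)
                    c[i][j] += a[k][i] * b[k][j]; // aindex = Get1dIndex(k, i): A transposed
    }

    int main()
    {
        float a[2][3] = {{1, 2, 3}, {4, 5, 6}}; // K = 2, M = 3 (transposed layout)
        float b[2][2] = {{1, 0}, {0, 1}};       // K = 2, N = 2 (identity)
        float c[3][2] = {};

        threadwise_gemm_reference<3, 2, 2>(a, b, c);

        std::printf("c[0] = {%g, %g}\n", c[0][0], c[0][1]); // prints {1, 4}
    }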