yangql / composable_kernel-1
Commit b5b4fd28, authored Jan 21, 2019 by Chao Liu

refactor

parent c64f63d5
Showing 6 changed files with 271 additions and 75 deletions (+271, -75)
driver/conv.cu                                                        +3   -3
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh          +22  -23
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh          +12  -14
src/include/gemm.cuh                                                  +206 -4
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh   +5   -5
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh   +23  -26
driver/conv.cu
@@ -354,10 +354,10 @@ int main()
 {
 #if 0
     constexpr unsigned N = 1;
-    constexpr unsigned C = 2;
+    constexpr unsigned C = 1;
     constexpr unsigned HI = 34;
     constexpr unsigned WI = 34;
-    constexpr unsigned K = 2;
+    constexpr unsigned K = 4;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
 #elif 1
@@ -418,7 +418,7 @@ int main()
     device_direct_convolution_2
 #elif 0
     device_implicit_gemm_convolution_1_nchw_kcsr
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
 #elif 1
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
@@ -4,12 +4,12 @@
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcsr,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        unsigned nrepeat)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -104,7 +104,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
     constexpr unsigned WoPerThread = 1;

     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 32;
     constexpr unsigned CPerBlock = 4;
@@ -137,20 +137,20 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
         cudaEventRecord(start, 0);

         gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw<GridSize,
                                                             BlockSize,
                                                             T,
                                                             decltype(in_nchw_desc),
                                                             decltype(wei_srck_desc),
                                                             decltype(out_nkhw_desc),
                                                             NPerBlock,
                                                             KPerBlock,
                                                             CPerBlock,
                                                             HoPerBlock,
                                                             WoPerBlock,
                                                             KPerThread,
                                                             CPerThread,
                                                             HoPerThread,
                                                             WoPerThread>
             <<<grid_dim, block_dim>>>(in_nchw_desc,
                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                       wei_srck_desc,
@@ -165,10 +165,9 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

-        usleep(10);
+        usleep(10000);
     }

     checkCudaErrors(cudaGetLastError());

     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
 }
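
Note on the usleep change: the argument is in microseconds, so the old usleep(10) paused only 10 us between timed launches, while usleep(10000) pauses 10 ms. Below is a minimal standalone sketch of the measurement pattern this driver uses; the kernel and sizes are placeholders, not from the repo:

#include <cstdio>
#include <unistd.h>
#include <cuda_runtime.h>

// Placeholder kernel standing in for the convolution under test.
__global__ void dummy_kernel(float* p) { p[threadIdx.x] += 1.f; }

int main()
{
    float* p_dev;
    cudaMalloc(&p_dev, 64 * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for(unsigned i = 0; i < 3; ++i) // nrepeat
    {
        cudaEventRecord(start, 0);
        dummy_kernel<<<1, 64>>>(p_dev);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop); // make the elapsed-time query valid

        float elapsedTime = 0.f;
        cudaEventElapsedTime(&elapsedTime, start, stop); // reported in ms
        printf("Elapsed time : %f ms\n", elapsedTime);

        usleep(10000); // 10,000 us = 10 ms idle gap between repetitions
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(p_dev);
    return 0;
}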
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
 #pragma once
 #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
+#include <unistd.h>

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
@@ -67,35 +68,29 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
 #if 0
     constexpr unsigned BPerBlock = 128;
-    constexpr unsigned KPerBlock = 1;
+    constexpr unsigned KPerBlock = 4;
     constexpr unsigned CPerBlock = 1;

     constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 1;
     constexpr unsigned CPerThread = 1;

     constexpr unsigned BlockSize = 32;
 #elif 0
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 2;
     constexpr unsigned CPerBlock = 2;

     constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 2;
     constexpr unsigned CPerThread = 1;

+    constexpr unsigned ThreadPerClusterRow = 4;
+    constexpr unsigned ThreadPerClusterColumn = 16;

-    constexpr unsigned BlockSize = 32;
+    constexpr unsigned BlockSize = 128;
 #elif 1
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 2;

-    constexpr unsigned BPerBatch = 32;
-
     constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 16;
     constexpr unsigned CPerThread = 1;

+    constexpr unsigned ThreadPerClusterRow = 4;
+    constexpr unsigned ThreadPerClusterColumn = 16;

     constexpr unsigned BlockSize = 128;
 #endif
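
As a consistency check on the now-active #elif 1 block: the clustered decomposition added in this commit requires BlockSize threads to cover the KPerBlock x BPerBlock output tile exactly. With KPerThread = 16 and ThreadPerClusterRow = 4, one pass covers all 64 rows; with BPerThread = 4 and ThreadPerClusterColumn = 16, one pass covers 64 of the 128 columns, so two column passes are needed, giving (1 * 4) * (2 * 16) = 128 = BlockSize. A standalone restatement of that arithmetic (it mirrors the static_asserts in gemm.cuh; naming M/N after the K x B thread matrix is my convention here):

// Sanity check for the "#elif 1" tuning block; compiles on its own.
constexpr unsigned BPerBlock = 128, KPerBlock = 64;
constexpr unsigned BPerThread = 4, KPerThread = 16;
constexpr unsigned ThreadPerClusterRow = 4, ThreadPerClusterColumn = 16;
constexpr unsigned BlockSize = 128;

// M = K (rows of the C matrix), N = B (columns of the C matrix).
constexpr unsigned MClusterWork =
    (KPerBlock + KPerThread * ThreadPerClusterRow - 1) / (KPerThread * ThreadPerClusterRow); // = 1
constexpr unsigned NClusterWork =
    (BPerBlock + BPerThread * ThreadPerClusterColumn - 1) /
    (BPerThread * ThreadPerClusterColumn); // = 2

static_assert(BlockSize ==
                  (MClusterWork * ThreadPerClusterRow) * (NClusterWork * ThreadPerClusterColumn),
              "block must be filled exactly: (1 * 4) * (2 * 16) == 128");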
@@ -137,7 +132,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
                                   BPerThread,
                                   KPerThread,
                                   CPerThread,
-                                  BPerBatch>
+                                  ThreadPerClusterRow,
+                                  ThreadPerClusterColumn>
         <<<grid_dim, block_dim>>>(in_cnhw_desc,
                                   static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
                                   wei_srck_desc,
@@ -151,6 +147,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);
+
+        usleep(10000);
     }

     checkCudaErrors(cudaGetLastError());
src/include/gemm.cuh
@@ -156,11 +156,11 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
     static_assert(MPerBlock % MPerThread == 0, "MPerBlock % MPerThread != 0");
     static_assert(NPerBlock % NPerThread == 0, "NPerBlock % NPerThread != 0");

-    constexpr unsigned BThreadWork = (BatchSize + BatchPerThread - 1) / BatchPerThread;
-    constexpr unsigned MThreadWork = (MPerBlock + MPerThread - 1) / MPerThread;
-    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
+    constexpr unsigned BatchThreadWork = (BatchSize + BatchPerThread - 1) / BatchPerThread;
+    constexpr unsigned MThreadWork     = (MPerBlock + MPerThread - 1) / MPerThread;
+    constexpr unsigned NThreadWork     = (NPerBlock + NPerThread - 1) / NPerThread;

-    static_assert(BlockSize == BThreadWork * MThreadWork * NThreadWork,
+    static_assert(BlockSize == BatchThreadWork * MThreadWork * NThreadWork,
                   "wrong! wrong BlockSize");

     if(DistributeThreadAlongColumnFirst)
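
The (x + y - 1) / y expressions above are integer ceiling division: each per-thread work count rounds up, and the static_assert then pins BlockSize to the exact product. A tiny self-contained illustration (ceil_div is a hypothetical helper name, not from the repo):

#include <cassert>

// Integer ceiling division, the idiom behind BatchThreadWork/MThreadWork/NThreadWork.
constexpr unsigned ceil_div(unsigned x, unsigned y) { return (x + y - 1) / y; }

int main()
{
    assert(ceil_div(8, 4) == 2); // divides evenly
    assert(ceil_div(9, 4) == 3); // rounds up
    // BlockSize == BatchThreadWork * MThreadWork * NThreadWork, e.g. 2 * 8 * 8 == 128.
    assert(ceil_div(8, 4) * ceil_div(32, 4) * ceil_div(32, 4) == 128);
    return 0;
}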
@@ -289,3 +289,205 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
         }
     }
 };
+
+template <unsigned BlockSize,
+          class BlockMatrixA,
+          class BlockMatrixB,
+          class ThreadMatrixC,
+          bool TransA,
+          bool TransB,
+          bool TransC,
+          unsigned KPerThreadLoop,
+          unsigned MThreadPerCluster,
+          unsigned NThreadPerCluster,
+          bool DistributeThreadAlongColumnFirst>
+struct blockwise_gemm_block_a_block_b_thread_c
+{
+    unsigned mMyThreadOffsetA = 0;
+    unsigned mMyThreadOffsetB = 0;
+
+    struct MatrixIndex
+    {
+        unsigned row_begin;
+        unsigned col_begin;
+    };
+
+    __device__ blockwise_gemm_block_a_block_b_thread_c()
+    {
+        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
+        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+
+        const auto c_thread_mtx_index = CalculateThreadMatrixCIndex(get_thread_local_1d_id());
+
+        mMyThreadOffsetA = (!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row_begin, 0)
+                                     : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row_begin);
+
+        mMyThreadOffsetB = (!TransB) ? b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col_begin)
+                                     : b_block_mtx.Get1dIndex(c_thread_mtx_index.col_begin, 0);
+
+#if 0
+        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+        {
+            print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: ");
+            print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: ");
+            print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: ");
+
+            printf("%u %u, %u %u %u, %u %u\n",
+                   get_block_1d_id(),
+                   get_thread_local_1d_id(),
+                   c_thread_mtx_index.batch_begin,
+                   c_thread_mtx_index.row_begin,
+                   c_thread_mtx_index.col_begin,
+                   mMyThreadOffsetA,
+                   mMyThreadOffsetB);
+        }
+#endif
+    }
+
+    __device__ MatrixIndex CalculateThreadMatrixCIndex(unsigned thread_id) const
+    {
+        if(TransA && (!TransB) && (!TransC))
+        {
+            const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
+            const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+
+            static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
+                          "wrong! k dimension not consistent!");
+
+            constexpr unsigned MPerBlock = a_block_mtx.NCol();
+            constexpr unsigned NPerBlock = b_block_mtx.NCol();
+
+            const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+
+            // divide thread work
+            constexpr unsigned MPerThread = c_thread_mtx.NRow();
+            constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+            static_assert(MPerBlock % (MPerThread * MThreadPerCluster) == 0,
+                          "MPerBlock % (MPerThread * MThreadPerCluster) != 0");
+
+            static_assert(NPerBlock % (NPerThread * NThreadPerCluster) == 0,
+                          "NPerBlock % (NPerThread * NThreadPerCluster) != 0");
+
+            constexpr unsigned MClusterWork =
+                (MPerBlock + MPerThread * MThreadPerCluster - 1) /
+                (MPerThread * MThreadPerCluster);
+
+            constexpr unsigned NClusterWork =
+                (NPerBlock + NPerThread * NThreadPerCluster - 1) /
+                (NPerThread * NThreadPerCluster);
+
+            static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
+                                           (NClusterWork * NThreadPerCluster),
+                          "wrong! wrong BlockSize");
+
+            if(DistributeThreadAlongColumnFirst)
+            {
+                const unsigned cluster_work_block_id =
+                    thread_id / (MThreadPerCluster * NThreadPerCluster);
+
+                const unsigned thread_work_cluster_id =
+                    thread_id - cluster_work_block_id * (MThreadPerCluster * NThreadPerCluster);
+
+                const unsigned m_cluster_work_block_id = cluster_work_block_id / NThreadPerCluster;
+                const unsigned n_cluster_work_block_id =
+                    cluster_work_block_id - m_cluster_work_block_id * NThreadPerCluster;
+
+                const unsigned m_thread_work_cluster_id = thread_work_cluster_id / NThreadPerCluster;
+                const unsigned n_thread_work_cluster_id =
+                    thread_work_cluster_id - m_thread_work_cluster_id * NThreadPerCluster;
+
+#if 0
+                if(get_block_1d_id() == 0)
+                {
+                    printf("%u %u, \t"
+                           //"MClusterWork %u MThreadPerCluster %u NClusterWork %u NThreadPerCluster %u \t"
+                           "m_cluster_work_block_id %u n_cluster_work_block_id %u \t"
+                           "m_thread_work_cluster_id %u n_thread_work_cluster_id %u \t"
+                           "\n",
+                           get_block_1d_id(), get_thread_local_1d_id(),
+                           //MClusterWork, MThreadPerCluster, NClusterWork, NThreadPerCluster,
+                           m_cluster_work_block_id, n_cluster_work_block_id,
+                           m_thread_work_cluster_id, n_thread_work_cluster_id);
+                }
+#endif
+
+                return MatrixIndex{m_cluster_work_block_id * (MThreadPerCluster * MPerThread) +
+                                       m_thread_work_cluster_id * MPerThread,
+                                   n_cluster_work_block_id * (NThreadPerCluster * NPerThread) +
+                                       n_thread_work_cluster_id * NPerThread};
+            }
+            else
+            {
+                // not implemented
+                assert(false);
+            }
+        }
+        else
+        {
+            // not implemented
+            assert(false);
+        }
+    }
+
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void run(FloatA* const p_a_block,
+                        FloatB* const p_b_block,
+                        FloatC* p_c_thread,
+                        Accumulator f_accum) const
+    {
+        if(TransA && (!TransB) && (!TransC))
+        {
+            constexpr auto True  = Constant<bool, true>{};
+            constexpr auto False = Constant<bool, false>{};
+
+            const auto a_block_mtx  = BlockMatrixA{};  // constexpr doesn't compile
+            const auto b_block_mtx  = BlockMatrixB{};  // constexpr doesn't compile
+            const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+
+            constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
+
+            constexpr unsigned MPerThread = c_thread_mtx.NRow();
+            constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+            // a is transposed, b is not
+            const auto a_thread_mtx = make_ConstantMatrixDescriptor(
+                Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
+
+            const auto b_thread_mtx = make_ConstantMatrixDescriptor(
+                Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
+
+            FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+            FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+            // loop over k
+            for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
+            {
+                threadwise_matrix_copy(a_block_mtx,
+                                       p_a_block + mMyThreadOffsetA +
+                                           k_begin * a_block_mtx.RowStride(),
+                                       a_thread_mtx,
+                                       p_a_thread,
+                                       a_thread_mtx.GetLengths());
+
+                threadwise_matrix_copy(b_block_mtx,
+                                       p_b_block + mMyThreadOffsetB +
+                                           k_begin * b_block_mtx.RowStride(),
+                                       b_thread_mtx,
+                                       p_b_thread,
+                                       b_thread_mtx.GetLengths());
+
+                threadwise_gemm(a_thread_mtx,
+                                True,
+                                p_a_thread,
+                                b_thread_mtx,
+                                False,
+                                p_b_thread,
+                                c_thread_mtx,
+                                False,
+                                p_c_thread,
+                                f_accum);
+            }
+        }
+    }
+};
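
To make the column-first thread distribution concrete, here is a host-side replay of CalculateThreadMatrixCIndex for the configuration the driver now selects (ThreadPerClusterRow = 4 maps to MThreadPerCluster, ThreadPerClusterColumn = 16 to NThreadPerCluster, and each thread owns a 16x4 C tile inside a 64x128 block tile). This is an illustrative re-derivation under those assumed numbers, not code from the commit:

#include <cassert>
#include <cstdio>

// Host-side replay of the column-first distribution for
// MThreadPerCluster=4, NThreadPerCluster=16, MPerThread=16, NPerThread=4.
struct MatrixIndex { unsigned row_begin, col_begin; };

MatrixIndex thread_c_index(unsigned thread_id)
{
    constexpr unsigned MThreadPerCluster = 4, NThreadPerCluster = 16;
    constexpr unsigned MPerThread = 16, NPerThread = 4;

    const unsigned cluster_work_block_id = thread_id / (MThreadPerCluster * NThreadPerCluster);
    const unsigned thread_work_cluster_id =
        thread_id - cluster_work_block_id * (MThreadPerCluster * NThreadPerCluster);

    const unsigned m_cluster_work_block_id = cluster_work_block_id / NThreadPerCluster;
    const unsigned n_cluster_work_block_id =
        cluster_work_block_id - m_cluster_work_block_id * NThreadPerCluster;

    const unsigned m_thread_work_cluster_id = thread_work_cluster_id / NThreadPerCluster;
    const unsigned n_thread_work_cluster_id =
        thread_work_cluster_id - m_thread_work_cluster_id * NThreadPerCluster;

    return {m_cluster_work_block_id * (MThreadPerCluster * MPerThread) +
                m_thread_work_cluster_id * MPerThread,
            n_cluster_work_block_id * (NThreadPerCluster * NPerThread) +
                n_thread_work_cluster_id * NPerThread};
}

int main()
{
    // Thread 0 owns the 16x4 sub-tile at (0,0); thread 127 owns the one at (48,124).
    assert(thread_c_index(0).row_begin == 0 && thread_c_index(0).col_begin == 0);
    assert(thread_c_index(127).row_begin == 48 && thread_c_index(127).col_begin == 124);
    printf("128 threads tile a 64x128 C block in 16x4 pieces\n");
    return 0;
}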
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
@@ -23,11 +23,11 @@ template <unsigned GridSize,
           unsigned WoPerThread>
 __global__ void
 gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
                                                     Float* const __restrict__ p_in_global,
                                                     WeiGlobalDesc,
                                                     Float* const __restrict__ p_wei_global,
                                                     OutGlobalDesc,
                                                     Float* __restrict__ p_out_global)
 {
     // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
     // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
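
The comment reads the convolution as an implicit GEMM over the LDS-resident blocks: A = trans(wei[C,K]) contributes M = K, B = in[C, Wo*N] contributes N = Wo*N, and the C dimension is reduced. With hypothetical sizes (purely illustrative, not taken from this commit) the bookkeeping is:

// Implicit-GEMM shape bookkeeping for "trans([C,K]) * [C,Wo*N]"; sizes are made up.
constexpr unsigned N = 1, C = 8, K = 16; // batch, input channels, output channels
constexpr unsigned Wo = 32;              // output width

constexpr unsigned gemm_m = K;      // rows of the output matrix
constexpr unsigned gemm_n = Wo * N; // columns: all N values per pixel, hence NPerThread == NPerBlock
constexpr unsigned gemm_k = C;      // reduction dimension

static_assert(gemm_m == 16 && gemm_n == 32 && gemm_k == 8, "shape bookkeeping only");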
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -20,7 +20,8 @@ template <unsigned GridSize,
           unsigned BPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
-          unsigned BPerBatch>
+          unsigned ThreadPerClusterRow,
+          unsigned ThreadPerClusterColumn>
 __global__ void
 gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
                                                     Float* const __restrict__ p_in_global,
@@ -112,31 +113,26 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
         Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile

-    static_assert(BPerBlock % BPerBatch == 0 && BPerBatch % BPerThread == 0,
-                  "B cannot be evenly divided\n");
-
-    const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
-        Number<CPerBlock>{}, Number<BPerBatch>{}, Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
+    const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
+        Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile

     const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
         Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile

-    const auto blockwise_batched_gemm =
-        blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c<BlockSize,
-                                                                   decltype(a_cxk_block_mtx_desc),
-                                                                   decltype(b_cxb_block_mtx_desc),
-                                                                   decltype(c_kxb_thread_mtx_desc),
-                                                                   true,
-                                                                   false,
-                                                                   false,
-                                                                   0,
-                                                                   BPerBatch,
-                                                                   0,
-                                                                   BPerBlock / BPerBatch,
-                                                                   1,
-                                                                   CPerThread,
-                                                                   true>{};
+    const auto blockwise_gemm =
+        blockwise_gemm_block_a_block_b_thread_c<BlockSize,
+                                                decltype(a_cxk_block_mtx_desc),
+                                                decltype(b_cxb_block_mtx_desc),
+                                                decltype(c_kxb_thread_mtx_desc),
+                                                true,
+                                                false,
+                                                false,
+                                                CPerThread,
+                                                ThreadPerClusterRow,
+                                                ThreadPerClusterColumn,
+                                                true>{};

     // LDS
     constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace();
@@ -175,6 +171,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
         __syncthreads();

 #if 1
+        // a series of GEMM
         for(unsigned s = 0; s < S; ++s)
         {
@@ -182,31 +179,31 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
             {
                 auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

-                blockwise_batched_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
+                blockwise_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                    p_in_block + s * Wi + r,
                                    p_out_thread,
                                    f_accum);
             }
         }
 #endif
     }

     // output: register to global mem,
-    const auto matrix_c_index = blockwise_batched_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());
+    const auto matrix_c_index = blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());

     const unsigned k_thread_data_begin = matrix_c_index.row_begin;
-    const unsigned b_thread_data_begin = matrix_c_index.batch_begin * BPerBatch + matrix_c_index.col_begin;
+    const unsigned b_thread_data_begin = matrix_c_index.col_begin;

     const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin;
     const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin;

 #if 0
-    //if(get_block_1d_id() == 10)
+    if(get_block_1d_id() == 0)
     {
-        printf("%u %u, batch_begin %u row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
+        printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
                get_block_1d_id(),
                get_thread_local_1d_id(),
-               matrix_c_index.batch_begin,
                matrix_c_index.row_begin,
                matrix_c_index.col_begin,
                k_data_begin,
@@ -228,7 +225,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
             unsigned w_data = itmp - h_data * Wi;
 #if 0
-            if(get_block_1d_id() == 10)
+            if(get_block_1d_id() == 0)
             {
                 printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
                        get_block_1d_id(),