yangql / composable_kernel-1 / Commits / efd419ec

Commit efd419ec authored Jul 29, 2019 by Chao Liu

refactored implicit gemm v1r3

parent 9ba3b491
Showing 6 changed files with 96 additions and 117 deletions:

composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp (+57 -77)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp (+7 -21)
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp (+18 -5)
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp (+3 -3)
driver/src/driver.cpp (+8 -8)
script/cmake-cuda.sh (+3 -3)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
     constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);

-    constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+    constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
         Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

     const auto block_work_multi_id =
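For reference, integer_divide_ceil above is plain round-up integer division: it counts how many block-sized tiles are needed to cover each output dimension. A minimal self-contained sketch (the values are illustrative, not from this commit):

    using index_t = unsigned;

    // Round-up integer division: number of tiles of size per_block
    // needed to cover total elements.
    constexpr index_t integer_divide_ceil(index_t total, index_t per_block)
    {
        return (total + per_block - 1) / per_block;
    }

    // Example: 34 output rows tiled by 4 rows per block -> 9 blocks,
    // the last block covering only 2 valid rows.
    static_assert(integer_divide_ceil(34, 4) == 9, "");

The switch to make_ConstantTensorDescriptor_packed presumably requests a descriptor with no padding between dimensions, matching the dense grid of block work items.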
@@ -109,8 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         GemmDataPerReadB);

     constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
-        Number<InBlockCopyDataPerRead_N>{});
+        Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Number<max_align>{});

     // this check is ad-hoc
     // TODO: need to properly implement tensor descriptor with alignment
@@ -118,11 +117,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         "GemmDataPerReadB alignment requirement is not meet");

     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, KPerBlock>{},
-        Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});

     // tensor view of threadwise output in register
-    constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+    constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
         Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

     // blockwise copy
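Both LDS tiles are now aligned to a single max_align instead of per-tile values; judging from the removed math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA) expression, max_align is presumably the least common multiple of every vector-access width that touches LDS, so one alignment satisfies all of them. A sketch of that idea:

    // lcm-based alignment: the smallest alignment that is a multiple of
    // every vector width reading or writing the same LDS tile.
    constexpr unsigned gcd(unsigned a, unsigned b) { return b == 0 ? a : gcd(b, a % b); }
    constexpr unsigned lcm(unsigned a, unsigned b) { return a / gcd(a, b) * b; }

    // Example: the copy writes 4 elements at a time and the GEMM reads 2
    // at a time; rows aligned to lcm(4, 2) = 4 elements satisfy both.
    static_assert(lcm(4, 2) == 4, "");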
@@ -144,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                        decltype(wei_c_k_global_desc),
                                        decltype(wei_c_k_block_desc),
                                        decltype(wei_c_k_block_desc.GetLengths()),
-                                       WeiBlockCopyDataPerRead_K>{};
+                                       WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});

     // a series of blockwise batched GEMM
     // C_matrix += transpose(A_matrix) * B_matrix
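The copy functor is now constructed with explicit {0, 0} begin-coordinates for its source and destination windows, and the batched GEMM that follows accumulates C += transpose(A) * B per thread. A reference loop for that accumulation semantics (a plain sketch, not the blocked kernel):

    // C_matrix += transpose(A_matrix) * B_matrix:
    // A is (k x m) and read transposed, B is (k x n), C is (m x n).
    template <typename Float>
    void gemm_at_b_accumulate(const Float* a, const Float* b, Float* c,
                              int k, int m, int n)
    {
        for(int ik = 0; ik < k; ++ik)         // reduction dimension
            for(int im = 0; im < m; ++im)     // rows of C, columns of A
                for(int in = 0; in < n; ++in) // columns of C
                    c[im * n + in] += a[ik * m + im] * b[ik * n + in];
    }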
@@ -186,22 +184,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         GemmDataPerReadA,
         GemmDataPerReadB>{};

-    // choose GEMM implementation here
-    const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 0
-        return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-        return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-        return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-    };
-
     // LDS: be careful of alignment
-    // TODO:: need to properly implement tensor descriptor with alignment
-    constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
-    constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});
+    constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace();
+    constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();

     __shared__ Float p_in_block[in_block_space];
     __shared__ Float p_wei_block[wei_block_space];
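The deleted lambda hard-coded the choice among Run, Run_amd_asm, and Run_asm_v2 with #if branches at each kernel; after this commit the call sites below invoke blockwise_batch_gemm.Run(...) directly, and the implementation choice is centralized inside Run itself (see the blockwise_batched_gemm.hpp hunks further down). The alignment argument to GetElementSpace is likewise gone because the alignment is now baked into the descriptors via max_align.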
@@ -225,7 +210,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #endif

     // set threadwise output tensor to 0
-    threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
+    threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);

 #if 1
     const Float* p_in_global_block_offset =
@@ -258,7 +243,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         __syncthreads();

-        run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+        blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);

         __syncthreads();
     }
@@ -291,7 +276,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         __syncthreads();

-        run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+        blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);

         __syncthreads();
     }
@@ -308,13 +293,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
     const index_t n_thread_data_begin  = c_thread_mtx_begin.col % NPerBlock;

-    static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto f_dummy) {
-        // f_dummy do nothing but perfect forwarding.
-        // Using this trick to make this lambda a generic lambda, so it won't be compiled until
-        // instantiated
+    static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) {
+        // fwd do nothing but perfect forwarding.
+        // Using this trick to make this lambda a generic lambda, so it won't be compiled until
+        // being instantiated here

-        static_assert((f_dummy(GemmNPerThreadSubC) <= NPerBlock &&
-                       NPerBlock % GemmNPerThreadSubC == 0),
+        static_assert((fwd(GemmNPerThreadSubC) <= NPerBlock &&
+                       NPerBlock % GemmNPerThreadSubC == 0),
                       "wrong!");

         // output is a 10d tensor
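The fwd parameter (previously f_dummy) is an identity functor whose only job is to make expressions inside the lambda depend on the lambda's template parameter: a generic lambda's body is type-checked only when instantiated, so a static_assert written as fwd(X) <= Y fires only in the branch that static_if actually invokes. A compact sketch of the idiom (static_if_then is a simplified stand-in for the library's static_if; needs C++17's implicitly-constexpr lambdas):

    // Calls f with an identity functor; a false-branch counterpart would
    // simply never call its lambda, leaving the body uninstantiated.
    template <typename F>
    void static_if_then(F f)
    {
        f([](auto x) { return x; });
    }

    template <int A, int B>
    void check()
    {
        static_if_then([](auto fwd) {
            // fwd(A) is type-dependent, so this assert is checked only
            // when the lambda body is actually instantiated.
            static_assert(fwd(A) <= B, "wrong!");
        });
    }

    // check<3, 4>() compiles; check<5, 4>() would fire the assert.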
@@ -322,37 +306,32 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr index_t N1 = NPerBlock / N2;

         constexpr index_t W2 =
-            (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
+            (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
         constexpr index_t W1 = WoPerBlock / W2;

         constexpr index_t K2 = GemmMPerThreadSubC;
         constexpr index_t K1 = KPerBlock / KPerThread;

-        constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
-            Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2), W1, W2,
-                     N / f_dummy(N1 * N2), N1, N2>{});
-
-        constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-            Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
+        constexpr auto out_10d_global_desc =
+            fwd(out_k_h_w_n_global_desc)
+                .Fold(I3, Number<N1>{}, Number<N2>{})
+                .Fold(I2, Number<W1>{}, Number<W2>{})
+                .Fold(I0, Number<K1>{}, Number<K2>{});
+
+        constexpr auto out_10d_thread_desc =
+            fwd(out_k_h_w_n_thread_desc)
+                .Fold(I3, Number<1>{}, Number<N2>{})
+                .Fold(I2, Number<W1>{}, Number<1>{})
+                .Fold(I0, Number<1>{}, Number<K2>{});

 #if 0
         if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
         {
-            print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, "out_k_h_w_n_thread_desc");
-            print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, "a: out_k_h_w_n_thread_desc");
+            print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc");

-            print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, "out_k_h_w_n_global_desc");
-            print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
+            print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, "a: out_k_h_w_n_global_desc");
+            print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc");
         }
 #endif
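This is the heart of the refactor: rather than spelling out the folded 10-d shapes by hand, Fold(Idim, Number<l1>{}, ...) splits one dimension of the existing 4-d descriptor into sub-dimensions, with the leading sub-dimension taking whatever remains, so lengths and strides are derived instead of duplicated. A shape-only sketch of folding a single extent (this mimics the arithmetic, not the library's descriptor machinery):

    #include <array>
    #include <cstddef>

    // Fold one extent L into {L / (l1*l2), l1, l2}: the leading entry is
    // the remainder dimension, the trailing ones are the requested factors.
    constexpr std::array<std::size_t, 3> fold2(std::size_t L, std::size_t l1, std::size_t l2)
    {
        return {L / (l1 * l2), l1, l2};
    }

    // Example: folding N = 128 with N1 = 4, N2 = 16 gives {2, 4, 16},
    // and 2 * 4 * 16 == 128, so the folded view covers the same elements.
    static_assert(fold2(128, 4, 16)[0] == 2, "");

Note how the thread-side descriptor folds with factors of 1 (e.g. Fold(I0, Number<1>{}, Number<K2>{})), so its 10-d axes line up one-for-one with the global descriptor's.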
@@ -367,8 +346,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                 n_block_data_begin + n_thread_data_begin),
             out_10d_thread_desc.GetLengths(),
             Number<OutThreadCopyDataPerWrite_N>{});
-    }).Else([&](auto f_dummy) {
-        static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
+    }).Else([&](auto fwd) {
+        static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                           GemmNPerThreadSubC % NPerThread == 0,
                       "wrong!");
@@ -377,32 +356,33 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
         constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
-        constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);
+        constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);

         constexpr index_t K2 = GemmMPerThreadSubC;
         constexpr index_t K1 = KPerBlock / KPerThread;

-        constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
-            Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3,
-                     N / N1, N1>{});
+        constexpr auto out_10d_global_desc =
+            fwd(out_k_h_w_n_global_desc)
+                .Fold(I3, Number<N1>{})
+                .Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
+                .Fold(I0, Number<K1>{}, Number<K2>{});

-        constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-            Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
+        constexpr auto out_10d_thread_desc =
+            fwd(out_k_h_w_n_thread_desc)
+                .Fold(I3, Number<N1>{})
+                .Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
+                .Fold(I0, Number<1>{}, Number<K2>{});

 #if 0
         if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
         {
-            print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, "out_k_h_w_n_thread_desc");
-            print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, "b: out_k_h_w_n_thread_desc");
+            print_ConstantTensorDescriptor(out_10d_thread_desc, "b: out_10d_thread_desc");

-            print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, "out_k_h_w_n_global_desc");
-            print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
-
-            for(index_t i = 0; i < 64; ++i)
-            {
-                printf("out %f, ", p_out_thread[i]);
-            }
+            print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, "b: out_k_h_w_n_global_desc");
+            print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc");
         }
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -114,8 +114,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         GemmDataPerReadB);

     constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
-        Number<InBlockCopyDataPerRead_N>{});
+        Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Number<max_align>{});

     // this check is ad-hoc
     // TODO: need to properly implement tensor descriptor with alignment
@@ -123,8 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         "GemmDataPerReadB alignment requirement is not meet");

     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, KPerBlock>{},
-        Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -201,21 +199,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         GemmDataPerReadA,
         GemmDataPerReadB>{};

-    // choose GEMM implementation here
-    const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 1
-        return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-        return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-        return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-    };
-
     // LDS: be careful of alignment
-    constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
-    constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});
+    constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace();
+    constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();

     // LDS double buffer
     __shared__ Float p_in_block_double[2 * in_block_space];
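The double-buffer variant reserves two copies of each tile (2 * in_block_space, 2 * wei_block_space) and ping-pongs between them, so the GEMM on the current buffer overlaps with the loads that fill the other. The index arithmetic in miniature (illustrative only; the real kernel stages global loads through register clipboards, as the hunks below show):

    #include <cstdio>

    int main()
    {
        const int n_iter = 4;
        for(int i = 0; i < n_iter; ++i)
        {
            const int cur  = i % 2;       // half being consumed by the GEMM
            const int next = (i + 1) % 2; // half being filled for iteration i+1
            std::printf("iter %d: compute on half %d, prefetch into half %d\n",
                        i, cur, next);
        }
        // In the kernel, "half k" is an offset of k * in_block_space (or
        // k * wei_block_space) into the __shared__ double buffer.
    }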
@@ -307,7 +293,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
             blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                         p_wei_register_clipboard);

-            run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -335,7 +321,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                     p_wei_register_clipboard);

         // LDS double buffer: GEMM on current data
-        run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+        blockwise_batch_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -347,7 +333,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         __syncthreads();

         // LDS double buffer: GEMM on current data
-        run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
+        blockwise_batch_gemm.Run(p_wei_block_double + wei_block_space,
                                  p_in_block_double + in_block_space,
                                  p_out_thread);
     }
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
@@ -170,7 +170,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }

     template <class FloatA, class FloatB, class FloatC>
-    __device__ void Run(const FloatA* __restrict__ p_a_block,
+    __device__ void Run_source(const FloatA* __restrict__ p_a_block,
                         const FloatB* __restrict__ p_b_block,
                         FloatC* __restrict__ p_c_thread) const
     {
@@ -189,10 +189,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         // thread A, B for GEMM
         // A is transposed, b is not
         constexpr auto a_thread_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+            make_ConstantMatrixDescriptor_packed(Number<KPerThreadLoop>{}, Number<MPerThread>{});

         constexpr auto b_thread_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+            make_ConstantMatrixDescriptor_packed(Number<KPerThreadLoop>{}, Number<NPerThread>{});

         // thread A-sub, B-sub for copy
         constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
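A packed matrix descriptor is presumably one whose row stride equals its column count, i.e. rows are stored back-to-back with no padding, while the plain make_ConstantMatrixDescriptor takes an explicit stride. A toy model of the distinction (illustrative, not the library's implementation):

    // Toy row-major matrix descriptor: (rows, cols, row stride).
    struct MatrixDesc
    {
        unsigned rows, cols, stride;
        constexpr unsigned offset(unsigned r, unsigned c) const { return r * stride + c; }
    };

    // "packed": stride == cols, so there are no padding elements.
    constexpr MatrixDesc make_packed(unsigned rows, unsigned cols) { return {rows, cols, cols}; }

    // Example: in a 4x8 packed tile, element (1, 2) sits at offset 10.
    static_assert(make_packed(4, 8).offset(1, 2) == 10, "");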
@@ -480,6 +480,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }
 #endif

+    template <class FloatA, class FloatB, class FloatC>
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
+
     template <class BlockMatrixC, index_t BlockMatrixStrideC, class FloatC>
     __device__ void CopyThreadMatrixCToBlockMatrixC(const FloatC* __restrict__ p_c_thread,
                                                     FloatC* __restrict__ p_c_block) const
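This new Run is the replacement for the per-call-site chooser lambdas deleted earlier: the asm-versus-source decision is made once from build-wide macros, and every kernel simply calls Run. The same pattern in miniature (the macro names are the ones visible in the hunk; the bodies are illustrative):

    #include <cstdio>

    #ifndef CK_USE_AMD_INLINE_ASM
    #define CK_USE_AMD_INLINE_ASM 0 // normally set by the build system
    #endif

    struct Gemm
    {
        void Run_amd_asm() const { std::printf("inline-asm path\n"); }
        void Run_source() const { std::printf("generic C++ path\n"); }

        // Single public entry point: the implementation choice lives here
        // instead of being repeated at every call site.
        void Run() const
        {
    #if CK_USE_AMD_INLINE_ASM
            Run_amd_asm();
    #else
            Run_source();
    #endif
        }
    };

    int main() { Gemm{}.Run(); }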
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -143,7 +143,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K   = 4;
     constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Pascal
     // for 3x3, 28x28, v1r3, Pascal
     // for 3x3, 14x14, v1r3, Pascal
@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
-        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
+#elif 1
+        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
         <GridSize,
driver/src/driver.cpp
@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 3x3, 34x34
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 3;
     constexpr index_t X = 3;

-    using ConvStrides   = Sequence<2, 2>;
+    using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

     constexpr index_t HPad = 0;
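With the stride switched from 2 to 1, the output spatial size for this unpadded 3x3 case follows the usual convolution formula Ho = (Hi + 2*HPad - Y) / stride + 1. A worked check under the hunk's values (Hi = 34 is implied by the "3x3, 34x34" comment):

    using index_t = unsigned;

    // Output extent of a 1-D convolution window (no dilation).
    constexpr index_t conv_out_size(index_t in, index_t pad, index_t filter, index_t stride)
    {
        return (in + 2 * pad - filter) / stride + 1;
    }

    // 34x34 input, 3x3 filter, pad 0: stride 2 gave 16x16 outputs,
    // stride 1 gives 32x32.
    static_assert(conv_out_size(34, 0, 3, 2) == 16, "");
    static_assert(conv_out_size(34, 0, 3, 1) == 32, "");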
@@ -519,19 +519,19 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
     (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+#elif 0
+    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
+    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
     (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
     (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
     (in_nchw_desc, in_nchw,
script/cmake-cuda.sh
@@ -13,11 +13,11 @@ cmake
 -D CMAKE_BUILD_TYPE=Release \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
--D BOOST_ROOT="/package/install/boost_1.67.0" \
--D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
+-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
+-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
 ${MY_PROJECT_SOURCE}

+#-D BOOST_ROOT="/package/install/boost_1.67.0" \
 #-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
 #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
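The NVIDIA build now points CUDA_COMMON_INCLUDE_DIR at the CUDA 10.1 samples tree, drops the duplicated -Xptxas -v occurrences along with the hard register cap -maxrregcount=128, and parks the BOOST_ROOT setting in the commented-out block. Without -maxrregcount, ptxas chooses register usage per kernel, which can change the occupancy-versus-spilling trade-off.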