Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
8bf95038
Commit
8bf95038
authored
Mar 22, 2021
by
Chao Liu
Browse files
refactor
parent
8b5e63ed
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
38 additions
and
37 deletions
+38
-37
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+25
-25
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+6
-6
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+7
-6
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
8bf95038
...
@@ -232,7 +232,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -232,7 +232,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
#if
1 // pass tensor descriptors by value
#if
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
@@ -370,7 +370,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -370,7 +370,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptors by pointers
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -399,12 +399,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -399,12 +399,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
{
const
auto
kernel
=
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_gemmk_gemmm_global_desc
),
const
decltype
(
wei_gemmk_gemmm_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
in_gemmk_gemmn_global_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
const
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
)
*
,
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>>
;
integral_constant
<
bool
,
true
>>
;
...
@@ -431,12 +431,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -431,12 +431,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
{
const
auto
kernel
=
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_gemmk_gemmm_global_desc
),
const
decltype
(
wei_gemmk_gemmm_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
in_gemmk_gemmn_global_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
const
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
)
*
,
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
false
>>
;
integral_constant
<
bool
,
false
>>
;
...
@@ -463,12 +463,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -463,12 +463,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
{
const
auto
kernel
=
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_gemmk_gemmm_global_desc
),
const
decltype
(
wei_gemmk_gemmm_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
in_gemmk_gemmn_global_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
const
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
)
*
,
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
true
>>
;
integral_constant
<
bool
,
true
>>
;
...
@@ -495,12 +495,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -495,12 +495,12 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
{
const
auto
kernel
=
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_gemmk_gemmm_global_desc
),
const
decltype
(
wei_gemmk_gemmm_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
in_gemmk_gemmn_global_desc
),
const
decltype
(
in_gemmk_gemmn_global_desc
)
*
,
const
FloatAB
*
,
const
FloatAB
*
,
decltype
(
const
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
),
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
)
*
,
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>>
;
integral_constant
<
bool
,
false
>>
;
...
@@ -537,7 +537,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
...
@@ -537,7 +537,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptor by void*
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -905,7 +905,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
@@ -905,7 +905,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
#if
1 // pass tensor descriptors by value
#if
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
@@ -1043,7 +1043,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
@@ -1043,7 +1043,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptors by pointers
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -1210,7 +1210,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
@@ -1210,7 +1210,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptor by void*
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -1564,7 +1564,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
...
@@ -1564,7 +1564,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
const
bool
has_double_tail_k_block_loop
=
(
GemmK
/
GemmKPerBlock
)
%
2
==
0
;
#if
1 // pass tensor descriptors by value
#if
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
@@ -1702,7 +1702,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
...
@@ -1702,7 +1702,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptors by pointers
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -1869,7 +1869,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
...
@@ -1869,7 +1869,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptor by void*
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
...
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
8bf95038
...
@@ -233,7 +233,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
...
@@ -233,7 +233,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
printf
(
"%s: BlockSize %d, GridSize %d
\n
"
,
__func__
,
BlockSize
,
GridSize
);
printf
(
"%s: BlockSize %d, GridSize %d
\n
"
,
__func__
,
BlockSize
,
GridSize
);
#if
1 // pass tensor descriptors by value
#if
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
@@ -369,7 +369,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
...
@@ -369,7 +369,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptors by pointers
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -534,7 +534,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
...
@@ -534,7 +534,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptor by void*
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -887,7 +887,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
...
@@ -887,7 +887,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
printf
(
"%s: BlockSize %d, GridSize %d
\n
"
,
__func__
,
BlockSize
,
GridSize
);
printf
(
"%s: BlockSize %d, GridSize %d
\n
"
,
__func__
,
BlockSize
,
GridSize
);
#if
1 // pass tensor descriptors by value
#if
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
index_t
nrepeat
=
100
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
@@ -1023,7 +1023,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
...
@@ -1023,7 +1023,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptors by pointers
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
@@ -1188,7 +1188,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
...
@@ -1188,7 +1188,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
#elif
1 // pass tensor descriptor by void*
#elif
CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
ADesc
=
decltype
(
wei_gemmk_gemmm_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
BDesc
=
decltype
(
in_gemmk_gemmn_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
using
CDesc
=
decltype
(
out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc
);
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
8bf95038
...
@@ -11,13 +11,9 @@
...
@@ -11,13 +11,9 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
// GPU ID
#if 0
#define CK_AMD_GPU_GFX906 0
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0
#elif 0
#define CK_AMD_GPU_GFX908 1
#else
#define CK_AMD_GPU_GFX1030 1
#define CK_AMD_GPU_GFX1030 1
#endif
// HIP version
// HIP version
#ifndef CK_HIP_VERSION_FLAT
#ifndef CK_HIP_VERSION_FLAT
...
@@ -108,6 +104,11 @@
...
@@ -108,6 +104,11 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
#endif
// pass tensor descriptor by value, pointer or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
// hack: relies on underlying assumptions that need to be satisfied, otherwise it's a bug
// hack: relies on underlying assumptions that need to be satisfied, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// thread-invariant, otherwise it's a bug
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment