Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e6b32ffe
Commit
e6b32ffe
authored
Apr 27, 2022
by
wangshaojie6
Browse files
use fastest config
parent
ff6aacaf
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
5 deletions
+5
-5
include/ck/config.hpp
include/ck/config.hpp
+1
-1
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+4
-4
No files found.
include/ck/config.hpp
View file @
e6b32ffe
...
@@ -107,7 +107,7 @@
...
@@ -107,7 +107,7 @@
// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
// pointer of scalar
// pointer of scalar
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
0
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
e6b32ffe
...
@@ -312,8 +312,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -312,8 +312,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
static
constexpr
index_t
A_K1_vec
=
A_K1
/
2
;
//
static constexpr index_t A_K1_vec = A_K1 / 2;
static
constexpr
index_t
B_K1_vec
=
B_K1
/
2
;
//
static constexpr index_t B_K1_vec = B_K1 / 2;
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
FloatAB
,
FloatAB
,
...
@@ -322,7 +322,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -322,7 +322,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence
<
1
,
1
,
1
,
KPerBlock
>
,
Sequence
<
1
,
1
,
1
,
KPerBlock
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
A_K1
_vec
,
A_K1
,
A_K1
>
;
A_K1
>
;
using
BThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
using
BThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
...
@@ -332,7 +332,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -332,7 +332,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
Sequence
<
1
,
1
,
1
,
KPerBlock
>
,
Sequence
<
1
,
1
,
1
,
KPerBlock
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
B_K1
_vec
,
B_K1
,
B_K1
>
;
B_K1
>
;
AThreadCopy
a_thread_copy_
{
CalculateAThreadOriginDataIndex
()};
AThreadCopy
a_thread_copy_
{
CalculateAThreadOriginDataIndex
()};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment