Commit 1e347018, authored Jan 31, 2021 by Chao Liu

N-D copy for threadwise copy v3

Parent: b8d64385

Showing 7 changed files with 226 additions and 44 deletions (+226 -44)
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +4 -0
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp  +1 -1
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp  +2 -2
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  +214 -36
composable_kernel/include/utility/config.amd.hpp.in  +1 -1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +3 -3
driver/src/conv_driver.cpp  +1 -1
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -184,7 +184,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
         GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
         GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
         Sequence<1, 0>,
+#if 1 // debug
+        Sequence<1, 0>,
+#else
         Sequence<0, 1>,
+#endif
         0,
         GemmABlockTransferSrcScalarPerVector_GemmK,
         GemmABlockTransferDstScalarPerVector_GemmM,
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp

@@ -923,7 +923,7 @@ struct DynamicMerge
                                           const UpIdx& idx_up_new,
                                           Number<Hack>) const
    {
-#if 0
+#if 1
        UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#elif 0
        UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp

@@ -269,7 +269,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        // hack to control index calculation when iterating over b_k_n_global tensor
-#if 1
+#if 0
        // for padded input
        constexpr auto b_k_n_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},

@@ -279,7 +279,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        constexpr auto b_k_n_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{};
-#elif 0
+#elif 1
        // for non-padded input
        constexpr auto b_k_n_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 1, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 1>{}),
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -9,21 +9,21 @@ namespace ck {

 // TODO: How to fix this? It uses an struct instead of lambda because lambda
 // doesn't have constructor
-template <index_t DstVectorDim, index_t DstScalarPerVector>
-struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access
+template <index_t VectorDim, index_t ScalarPerVector>
+struct lambda_scalar_per_access
 {
     __host__ __device__ constexpr auto operator()(index_t i) const
     {
-        return (i == DstVectorDim) ? DstScalarPerVector : 1;
+        return (i == VectorDim) ? ScalarPerVector : 1;
     }
 };

-template <index_t DstVectorDim>
-struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector
+template <index_t VectorDim>
+struct lambda_scalar_step_in_vector
 {
     __host__ __device__ constexpr auto operator()(index_t i) const
     {
-        return (i == DstVectorDim) ? 1 : 0;
+        return (i == VectorDim) ? 1 : 0;
     }
 };
@@ -80,16 +80,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

         // TODO: don't use this
-        constexpr auto dst_scalar_per_access = generate_sequence(
-            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access<
-                DstVectorDim,
-                DstScalarPerVector>{},
-            Number<nDim>{});
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

-        constexpr auto dst_scalar_step_in_vector = generate_sequence(
-            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector<DstVectorDim>{},
-            Number<nDim>{});
+        constexpr auto dst_scalar_step_in_vector =
+            generate_sequence(lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});

         constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
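Aside, not part of the diff: the renamed functors are simple per-dimension selectors. lambda_scalar_per_access yields ScalarPerVector on the vectorized dimension and 1 everywhere else, lambda_scalar_step_in_vector yields 1 there and 0 elsewhere, so access_lengths = SliceLengths{} / dst_scalar_per_access shrinks only the vectorized dimension. A minimal standalone C++17 sketch of what they compute, using made-up slice lengths (4, 1, 2, 8), VectorDim = 3 and ScalarPerVector = 4 rather than the ck utilities:

// Standalone illustration (plain C++17, no ck headers) of what the renamed
// lambda_scalar_per_access / lambda_scalar_step_in_vector functors produce.
// All concrete numbers below are hypothetical, not taken from the commit.
#include <array>
#include <cstdio>

constexpr int nDim = 4;

// scalar_per_access[i] == ScalarPerVector on the vectorized dimension, 1 elsewhere
constexpr std::array<int, nDim> scalar_per_access(int vector_dim, int scalar_per_vector)
{
    std::array<int, nDim> s{};
    for(int i = 0; i < nDim; ++i)
        s[i] = (i == vector_dim) ? scalar_per_vector : 1;
    return s;
}

// scalar_step_in_vector[i] == 1 on the vectorized dimension, 0 elsewhere
constexpr std::array<int, nDim> scalar_step_in_vector(int vector_dim)
{
    std::array<int, nDim> s{};
    for(int i = 0; i < nDim; ++i)
        s[i] = (i == vector_dim) ? 1 : 0;
    return s;
}

int main()
{
    constexpr std::array<int, nDim> slice_lengths{4, 1, 2, 8}; // hypothetical SliceLengths
    constexpr auto per_access = scalar_per_access(/*vector_dim=*/3, /*scalar_per_vector=*/4);
    constexpr auto step       = scalar_step_in_vector(/*vector_dim=*/3);

    // access_lengths = SliceLengths / scalar_per_access (element-wise), as in the hunk above
    for(int i = 0; i < nDim; ++i)
        std::printf("dim %d: slice %d, per_access %d, access_len %d, step_in_vector %d\n",
                    i, slice_lengths[i], per_access[i], slice_lengths[i] / per_access[i], step[i]);
}

The sketch has no dependency on generate_sequence, Number or Sequence; it only mirrors the arithmetic those templates evaluate at compile time.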
@@ -341,8 +337,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                                              const Index& src_slice_origin,
                                              const DstDesc& dst_desc,
                                              const Index& dst_slice_origin)
-        : src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
-          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
+        : src_slice_origin_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
+          dst_slice_origin_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
     {
         static_assert(SrcAddressSpace == AddressSpace::Global or
                           SrcAddressSpace == AddressSpace::Lds,
@@ -360,14 +356,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
     {
-        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
+        src_slice_origin_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
     }

     __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
     {
-        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+        dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
     }

+#if 0
     template <typename SrcIteratorHacks>
     __device__ void RunRead(const SrcDesc& src_desc,
                             const SrcData* p_src,
@@ -447,10 +444,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
 #if 1
             src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
-                p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
+                p_src, src_slice_origin_coord_.GetOffset(), true, src_desc.GetElementSpaceSize());

             const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                src_desc, src_slice_origin_);
+                src_desc, src_slice_origin_coord_);

             src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
@@ -462,10 +459,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             });
 #else
             const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                src_desc, src_slice_origin_);
+                src_desc, src_slice_origin_coord_);

             src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
-                p_src, src_slice_origin_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
+                p_src, src_slice_origin_coord_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());

             static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                 constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
@@ -481,12 +478,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 if constexpr(forward_dim1)
                 {
                     move_dynamic_tensor_coordinate(
-                        src_desc, src_slice_origin_, src_forward_iterators[I1]);
+                        src_desc, src_slice_origin_coord_, src_forward_iterators[I1]);
                 }
                 else
                 {
                     move_dynamic_tensor_coordinate(
-                        src_desc, src_slice_origin_, src_backward_iterators[I1]);
+                        src_desc, src_slice_origin_coord_, src_backward_iterators[I1]);
                 }
             }
         });
@@ -495,7 +492,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             if constexpr(iter0.value < access_lengths[I0] - 1)
             {
                 move_dynamic_tensor_coordinate(
-                    src_desc, src_slice_origin_, src_forward_iterators[I0]);
+                    src_desc, src_slice_origin_coord_, src_forward_iterators[I0]);
             }
         });
@@ -505,9 +502,190 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             const auto src_reset_iterator =
                 make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());

-            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_reset_iterator);
+            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, src_reset_iterator);
         }
     }
+#else
+    template <typename SrcIteratorHacks>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcData* p_src,
+                            const SrcIteratorHacks& src_iterator_hacks)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        // TODO: don't use this
+        constexpr auto src_scalar_per_access = generate_sequence(
+            lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto src_scalar_step_in_vector =
+            generate_sequence(lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
+
+        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, src_dim_access_order);
+
+        // make forward iterators
+        const auto src_forward_iterators = generate_tuple(
+            [&](auto i) {
+                Index forward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    src_desc, forward_step, src_iterator_hacks[I0][i]);
+            },
+            Number<nDim>{});
+
+        // make backward iterators
+        const auto src_backward_iterators = generate_tuple(
+            [&](auto i) {
+                Index backward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    backward_step(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    src_desc, backward_step, src_iterator_hacks[I1][i]);
+            },
+            Number<nDim>{});
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+                forward_sweep(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_access_idx[I0];
+
+                    static_for<0, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
+                    });
+
+                    forward_sweep(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep;
+            }();
+
+            // calculate src data index
+            constexpr auto data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i]
+                                         ? ordered_access_idx[i]
+                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
+                });
+
+                auto data_idx = container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                                src_scalar_per_access;
+
+                return data_idx;
+            }();
+
+            // copy data
+            // hardcoding for buffer_load
+            // TODO refactor transfer_data() to encapsulate this
+            static_assert(SrcAddressSpace == AddressSpace::Global,
+                          "wrong! hardcoded to use buffer_load, src must be global mem");
+
+            vector_type<SrcData, SrcScalarPerVector> src_vector;
+
+            using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
+
+#if 1
+            src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
+                p_src, src_slice_origin_coord_.GetOffset(), true, src_desc.GetElementSpaceSize());
+
+            const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                src_desc, src_slice_origin_coord_);
+
+            src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
+
+            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                constexpr index_t buffer_offset =
+                    buffer_desc_.CalculateOffset(data_idx + i * src_scalar_step_in_vector);
+
+                buffer_(Number<buffer_offset>{}) = src_vector[i];
+            });
+#else
+            const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                src_desc, src_slice_origin_coord_);
+
+            src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
+                p_src, src_slice_origin_coord_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
+
+            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                constexpr index_t buffer_offset =
+                    buffer_desc_.CalculateOffset(data_idx + i * src_scalar_step_in_vector);
+
+                buffer_(Number<buffer_offset>{}) = src_vector[i];
+            });
+#endif
+
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                    });
+                });

+                return move_on_dim;
+            }
+            ();
+
+            // move
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_dynamic_tensor_coordinate(
+                            src_desc,
+                            src_slice_origin_coord_,
+                            src_forward_iterators[src_dim_access_order[i]]);
+                    }
+                    else
+                    {
+                        move_dynamic_tensor_coordinate(
+                            src_desc,
+                            src_slice_origin_coord_,
+                            src_backward_iterators[src_dim_access_order[i]]);
+                    }
+                }
+            });
+        });
+
+        // move src coordinate back to slice origin (or not)
+        if constexpr(SrcResetCoordinateAfterRun)
+        {
+            const auto src_reset_iterator =
+                make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());
+
+            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, src_reset_iterator);
+        }
+    }
+#endif

     __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
     {
         static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
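Aside, not part of the diff: the heart of the new N-D RunRead is the forward_sweep bookkeeping. static_ford enumerates the ordered access grid lexicographically, each dimension's sweep direction flips according to the parity of a flattened prefix index, and backward sweeps mirror the coordinate, so the copy walks the grid in a snake-like order and the source coordinate only has to move along one dimension per access. A standalone host-side sketch of that arithmetic, for a made-up 2 x 3 x 4 access grid with no ck machinery:

// Plain C++ reproduction of the forward_sweep / data_idx arithmetic from the
// hunk above; the lengths {2, 3, 4} are illustrative, not from the commit.
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 3;
    const std::array<int, nDim> lengths{2, 3, 4}; // made-up ordered access lengths

    // Enumerate the ordered access index lexicographically, like static_ford does.
    for(int i0 = 0; i0 < lengths[0]; ++i0)
        for(int i1 = 0; i1 < lengths[1]; ++i1)
            for(int i2 = 0; i2 < lengths[2]; ++i2)
            {
                const std::array<int, nDim> idx{i0, i1, i2};

                // Per-dimension sweep direction, computed the same way as
                // forward_sweep in the hunk above (parity of a flattened prefix).
                std::array<bool, nDim> forward_sweep{};
                forward_sweep[0] = true;
                for(int d = 1; d < nDim; ++d)
                {
                    int tmp = idx[0];
                    for(int j = 0; j < d; ++j)
                        tmp = tmp * lengths[j] + idx[j];
                    forward_sweep[d] = (tmp % 2 == 0);
                }

                // Actual visited position: mirror the coordinate on backward sweeps,
                // as in the data_idx lambda of the hunk above.
                std::array<int, nDim> data_idx{};
                for(int d = 0; d < nDim; ++d)
                    data_idx[d] = forward_sweep[d] ? idx[d] : lengths[d] - 1 - idx[d];

                std::printf("(%d, %d, %d)\n", data_idx[0], data_idx[1], data_idx[2]);
            }

    return 0;
}

With these lengths the printed sequence is a proper snake: consecutive positions differ in exactly one coordinate by 1, which is what lets RunRead advance src_slice_origin_coord_ with a single forward or backward iterator between accesses.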
@@ -549,7 +727,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                               DstInMemOp == InMemoryDataOperation::Set,
                               "wrong! hardcoded for ds_write");

-                p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];
+                p_dst[dst_slice_origin_coord_.GetOffset()] = buffer_[Number<buffer_offset>{}];

                 // move dim1 iterator
                 if constexpr(iter1.value < Len1 - 1)
@@ -557,12 +735,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                     if constexpr(forward_dim1)
                     {
                         move_dynamic_tensor_coordinate(
-                            dst_desc, dst_slice_origin_, dst_step_0_p);
+                            dst_desc, dst_slice_origin_coord_, dst_step_0_p);
                     }
                     else
                     {
                         move_dynamic_tensor_coordinate(
-                            dst_desc, dst_slice_origin_, dst_step_0_m);
+                            dst_desc, dst_slice_origin_coord_, dst_step_0_m);
                     }
                 }
             });
@@ -570,7 +748,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             // move dim0 iterator
             if constexpr(iter0.value < Len0 - 1)
             {
-                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p_0);
+                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_step_p_0);
             }
         });
     }
@@ -581,7 +759,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             const auto dst_reset_iterator =
                 make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());

-            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_reset_iterator);
+            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_reset_iterator);
         }
     }
@@ -666,7 +844,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         const auto adjusted_step =
             make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);

-        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
+        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, adjusted_step);
     }

     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -685,7 +863,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         const auto adjusted_step = make_dynamic_tensor_coordinate_iterator(
             src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack);

-        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
+        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, adjusted_step);
     }

     // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -701,7 +879,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         const auto adjusted_step =
             make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);

-        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
+        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, adjusted_step);
     }

     private:
@@ -712,8 +890,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     StaticallyIndexedArray<SrcData, buffer_size_> buffer_;

-    SrcCoord src_slice_origin_;
-    DstCoord dst_slice_origin_;
+    SrcCoord src_slice_origin_coord_;
+    DstCoord dst_slice_origin_coord_;
 };

 } // namespace ck
composable_kernel/include/utility/config.amd.hpp.in

@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workaround: put all workaround here
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -144,7 +144,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

-    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
    // cdata = 64, BlockSize = 256, 128x128x8
    // b thread copy 2x2

@@ -233,9 +233,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
-#if 1
+#if 0
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
-#elif 0
+#elif 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#elif 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
driver/src/conv_driver.cpp

@@ -37,7 +37,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128;
    constexpr index_t C = 192;
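Aside, not part of the diff: with the driver switched to this 3x3, 35x35, stride-2 case and the no_pad path selected in the device file above, the implicit-GEMM problem size follows the usual v4r4 forward mapping (GemmM = K, GemmN = N * Ho * Wo, GemmK = C * Y * X). A rough back-of-the-envelope sketch; only N = 128 and C = 192 come from the hunk above, while K, Hi, Wi, Y, X, stride, dilation and padding are illustrative assumptions because the rest of the configuration is truncated in this view:

// Hypothetical sanity check of the GEMM shape implied by a forward convolution
// under the v4r4 NCHW/KCYX/NKHW implicit-GEMM mapping. Numbers other than N and C
// are assumptions, not values from the commit.
#include <cstdio>

int main()
{
    const int N = 128, C = 192, Hi = 35, Wi = 35;   // input (N, C from the diff)
    const int K = 256, Y = 3, X = 3;                // filter (K assumed)
    const int stride = 2, dilation = 1, pad = 0;    // no_pad driver path

    // standard output-size formula for a convolution
    const int Ho = (Hi + 2 * pad - dilation * (Y - 1) - 1) / stride + 1;
    const int Wo = (Wi + 2 * pad - dilation * (X - 1) - 1) / stride + 1;

    const long GemmM = K;
    const long GemmN = long(N) * Ho * Wo;
    const long GemmK = long(C) * Y * X;

    std::printf("Ho=%d Wo=%d  GemmM=%ld GemmN=%ld GemmK=%ld\n", Ho, Wo, GemmM, GemmN, GemmK);
}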