gaoqiong / composable_kernel / Commits

Commit b8d64385, authored Jan 31, 2021 by Chao Liu
Parent: 39821a90

N-D tensor copy for threadwise copy v1r3

Changes: 6 changed files with 193 additions and 208 deletions.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+6, -1)
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp  (+1, -1)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  (+158, -202)
composable_kernel/include/utility/container_helper.hpp  (+24, -0)
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+2, -2)
driver/src/conv_driver.cpp  (+2, -2)
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -388,6 +388,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
     const index_t InRightPadH = in_right_pads[I0];
     const index_t InRightPadW = in_right_pads[I1];

+    if(!(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0))
+    {
+        throw std::runtime_error("wrong! 1x1, stride 1, no padding");
+    }
+
     // weight tensor
 #if 0
     // TODO implement graph optimization of tensor descriptor transformation
 ...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp

@@ -269,7 +269,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                        make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

         // hack to control index calculation when iterating over b_k_n_global tensor
-#if 0
+#if 1
         // for padded input
         constexpr auto b_k_n_global_iterator_hacks =
             make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},
 ...
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -7,9 +7,31 @@
 namespace ck {

+// TODO: How to fix this? It uses an struct instead of lambda because lambda
+// doesn't have constructor
+template <index_t DstVectorDim, index_t DstScalarPerVector>
+struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access
+{
+    __host__ __device__ constexpr auto operator()(index_t i) const
+    {
+        return (i == DstVectorDim) ? DstScalarPerVector : 1;
+    }
+};
+
+template <index_t DstVectorDim>
+struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector
+{
+    __host__ __device__ constexpr auto operator()(index_t i) const
+    {
+        return (i == DstVectorDim) ? 1 : 0;
+    }
+};
+
 // this version is less likely to have scratch memory issue, due to:
 // 1. It does not keep reference to tensor descriptor
 // 2. It does not construct new tensor coordinate for this->Run()
+// Assume src_slice_origin_idx is 0
+// TODO: support non-zero src_slice_oring_idx
 template <typename SrcData,
           typename DstData,
           typename SrcDesc,
 ...
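The two functor structs added above act as compile-time generators: generate_sequence evaluates them at indices 0 .. nDim-1 to build the per-dimension access lengths (DstScalarPerVector along DstVectorDim, 1 elsewhere) and the unit step inside a vector. A minimal host-only sketch of the same pattern, using std::index_sequence in place of ck's generate_sequence and Sequence (the names in the sketch are illustrative, not the ck API):

    #include <cstdio>
    #include <utility>

    // Functor playing the role of the ck "lambda_..." structs: a plain struct with a
    // constexpr operator(), usable where a closure type is awkward to default-construct.
    template <int VectorDim, int ScalarPerVector>
    struct ScalarPerAccess
    {
        constexpr int operator()(int i) const { return (i == VectorDim) ? ScalarPerVector : 1; }
    };

    // Simplified stand-in for ck's generate_sequence: evaluate the functor at 0..N-1
    // and return the results as a compile-time integer sequence.
    template <typename F, std::size_t... Is>
    constexpr auto generate(F, std::index_sequence<Is...>)
    {
        return std::integer_sequence<int, F{}(static_cast<int>(Is))...>{};
    }

    template <int... Vs>
    void print(std::integer_sequence<int, Vs...>)
    {
        ((std::printf("%d ", Vs)), ...);
        std::printf("\n");
    }

    int main()
    {
        // 4-D copy, vector dimension 3, 4 scalars per vector -> per-access lengths 1 1 1 4
        constexpr auto per_access = generate(ScalarPerAccess<3, 4>{}, std::make_index_sequence<4>{});
        print(per_access);
    }

Using a named struct with a constexpr operator() instead of a lambda keeps the generator default-constructible inside a constant expression, which is the workaround the in-code TODO refers to.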
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (continued)

@@ -57,153 +79,111 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        // hardcoded for 4D
-        // TODO implemente N-D
-        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
-                      "wrong! hardcoded for 4D tensor");
-
-        constexpr auto dst_scalar_per_access = [&]() {
-            Index dst_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
-            });
-
-            return dst_scalar_per_access;
-        }();
-
-        constexpr auto dst_scalar_step_in_vector = [&]() {
-            Index dst_scalar_step_in_vector;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                dst_scalar_step_in_vector(i) = (i == DstVectorDim) ? 1 : 0;
-            });
-
-            return dst_scalar_step_in_vector;
-        }();
-
-        constexpr auto access_lengths = [&]() {
-            Index access_lengths;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                access_lengths(i) = SliceLengths{}[i] / dst_scalar_per_access[i];
-            });
-
-            return access_lengths;
-        }();
-
-        const auto dst_forward_iterators = make_tuple(
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access, dst_iterator_hacks[I0][I0]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access, dst_iterator_hacks[I0][I1]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access, dst_iterator_hacks[I0][I2]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access, dst_iterator_hacks[I0][I3]));
-
-        const auto dst_backward_iterators = make_tuple(
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access, dst_iterator_hacks[I1][I0]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access, dst_iterator_hacks[I1][I1]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access, dst_iterator_hacks[I1][I2]),
-            make_dynamic_tensor_coordinate_iterator(
-                dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access, dst_iterator_hacks[I1][I3]));
-
-        // loop over dim0
-        static_for<0, SliceLengths{}[DimAccessOrder{}[I0]], dst_scalar_per_access[DimAccessOrder{}[I0]]>{}(
-            [&](auto iter0) {
-                constexpr index_t i0 = iter0;
-
-                constexpr bool forward_dim1 =
-                    (iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) % 2 == 0;
-
-                // loop over dim1
-                static_for<0, SliceLengths{}[DimAccessOrder{}[I1]], dst_scalar_per_access[DimAccessOrder{}[I1]]>{}(
-                    [&](auto iter1) {
-                        constexpr index_t i1 =
-                            forward_dim1 ? iter1
-                                         : SliceLengths{}[DimAccessOrder{}[I1]] -
-                                               dst_scalar_per_access[DimAccessOrder{}[I1]] - iter1;
-
-                        constexpr bool forward_dim2 =
-                            ((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) * access_lengths[DimAccessOrder{}[I1]] +
-                             (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) % 2 == 0;
-
-                        // loop over dim2
-                        static_for<0, SliceLengths{}[DimAccessOrder{}[I2]], dst_scalar_per_access[DimAccessOrder{}[I2]]>{}(
-                            [&](auto iter2) {
-                                constexpr index_t i2 =
-                                    forward_dim2 ? iter2
-                                                 : SliceLengths{}[DimAccessOrder{}[I2]] -
-                                                       dst_scalar_per_access[DimAccessOrder{}[I2]] - iter2;
-
-                                constexpr bool forward_dim3 =
-                                    (((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) * access_lengths[DimAccessOrder{}[I1]] +
-                                      (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) * access_lengths[DimAccessOrder{}[I2]] +
-                                     (iter2 / dst_scalar_per_access[DimAccessOrder{}[I2]])) % 2 == 0;
-
-                                // loop over dim3
-                                static_for<0, SliceLengths{}[DimAccessOrder{}[I3]], dst_scalar_per_access[DimAccessOrder{}[I3]]>{}(
-                                    [&](auto iter3) {
-                                        constexpr index_t i3 =
-                                            forward_dim3 ? iter3
-                                                         : SliceLengths{}[DimAccessOrder{}[I3]] -
-                                                               dst_scalar_per_access[DimAccessOrder{}[I3]] - iter3;
-
-                                        // do work
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access<
+                DstVectorDim,
+                DstScalarPerVector>{},
+            Number<nDim>{});
+
+        constexpr auto dst_scalar_step_in_vector = generate_sequence(
+            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector<DstVectorDim>{},
+            Number<nDim>{});
+
+        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dim_access_order = DimAccessOrder{};
+
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, dim_access_order);
+
+        // make forward iterators
+        const auto dst_forward_iterators = generate_tuple(
+            [&](auto i) {
+                Index forward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    forward_step(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    dst_desc, forward_step, dst_iterator_hacks[I0][i]);
+            },
+            Number<nDim>{});
+
+        // make backward iterators
+        const auto dst_backward_iterators = generate_tuple(
+            [&](auto i) {
+                Index backward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    backward_step(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    dst_desc, backward_step, dst_iterator_hacks[I1][i]);
+            },
+            Number<nDim>{});
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+                forward_sweep(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_access_idx[I0];
+
+                    static_for<0, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
+                    });
+
+                    forward_sweep(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep;
+            }();
+
+            // calculate dst data index
+            constexpr auto dst_data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i] ? ordered_access_idx[i]
+                                                      : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
+                });
+
+                auto dst_data_idx =
+                    container_reorder_given_old2new(ordered_idx, dim_access_order) * dst_scalar_per_access;
+
+                return dst_data_idx;
+            }();
+
+            // copy data
             // hardcoding for buffer_store
             // TODO refactor transfer_data() to encapsulate this
             static_assert(SrcAddressSpace == AddressSpace::Vgpr &&
                               DstAddressSpace == AddressSpace::Global,
                           "wrong! hardcoded to use buffer_store");
-
-            using DstVectorType = typename vector_type<DstData, DstScalarPerVector>::MemoryType;

             vector_type<DstData, DstScalarPerVector> dst_vector;

             // this is hardcoded for src that has compile-time tensor descriptor
             static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                // hack: assume src_slice_origin_idx is 0
-                constexpr index_t src_offset = SrcDesc::CalculateOffset(
-                    container_reorder_given_old2new(make_multi_index(i0, i1, i2, i3), DimAccessOrder{}) +
-                    i * dst_scalar_step_in_vector);
+                // assume src_slice_origin_idx is 0
+                // TODO: support non-zero src_slice_oring_idx
+                constexpr index_t src_offset =
+                    SrcDesc::CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

                 dst_vector(i) = p_src[Number<src_offset>{}];
             });

-#if 1
             amd_buffer_store_v2<DstData, DstScalarPerVector>(
                 dst_vector.Vector(),
                 p_dst,
 ...
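In the new Run() body a single static_ford walks every access position, and forward_sweep decides per dimension whether the current pass runs forward or backward from the parity of the flattened index of the slower dimensions. The intent is a snake (boustrophedon) order: consecutive accesses differ by one step in exactly one dimension, so only one coordinate iterator has to move between stores. A small self-contained sketch of that ordering on a 2x3x4 grid, with runtime loops standing in for static_ford (illustrative only, not the ck code):

    #include <cstdio>

    int main()
    {
        // Access lengths of a 3-D slice (slice lengths already divided by scalars-per-access).
        const int lengths[3] = {2, 3, 4};

        // Visit all positions in row-major order of the "ordered" index, but convert each
        // ordered index to a snake-order data index: a dimension sweeps backward whenever
        // the flattened index of the dimensions before it is odd (cf. forward_sweep).
        for(int i0 = 0; i0 < lengths[0]; ++i0)
            for(int i1 = 0; i1 < lengths[1]; ++i1)
                for(int i2 = 0; i2 < lengths[2]; ++i2)
                {
                    const int ordered[3] = {i0, i1, i2};

                    int data_idx[3];
                    for(int d = 0; d < 3; ++d)
                    {
                        // flatten the indices of dimensions 0..d-1
                        int tmp = 0;
                        for(int e = 0; e < d; ++e)
                            tmp = tmp * lengths[e] + ordered[e];

                        const bool forward = (tmp % 2 == 0);
                        data_idx[d] = forward ? ordered[d] : lengths[d] - 1 - ordered[d];
                    }

                    std::printf("(%d, %d, %d)\n", data_idx[0], data_idx[1], data_idx[2]);
                }
    }

Printing data_idx shows each triple differing from the previous one in a single coordinate, which is what lets the copy advance one iterator at a time.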
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (continued)

@@ -211,76 +191,52 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(
                     dst_desc, dst_slice_origin_coord_),
                 dst_desc.GetElementSpaceSize());
-#else
-            static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                amd_buffer_store_v2<DstData, 1>(
-                    dst_vector[i],
-                    p_dst,
-                    dst_slice_origin_coord_.GetOffset() + i.value,
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                        dst_desc, dst_slice_origin_coord_),
-                    dst_desc.GetElementSpaceSize());
-            });
-#endif
-
-                                        // move along dim3
-                                        if constexpr(iter3 < SliceLengths{}[DimAccessOrder{}[I3]] -
-                                                                 dst_scalar_per_access[DimAccessOrder{}[I3]])
-                                        {
-                                            if constexpr(forward_dim3)
-                                            {
-                                                move_dynamic_tensor_coordinate(
-                                                    dst_desc, dst_slice_origin_coord_,
-                                                    dst_forward_iterators[DimAccessOrder{}[I3]]);
-                                            }
-                                            else
-                                            {
-                                                move_dynamic_tensor_coordinate(
-                                                    dst_desc, dst_slice_origin_coord_,
-                                                    dst_backward_iterators[DimAccessOrder{}[I3]]);
-                                            }
-                                        }
-                                    });
-
-                                // move along dim2
-                                if constexpr(iter2 < SliceLengths{}[DimAccessOrder{}[I2]] -
-                                                         dst_scalar_per_access[DimAccessOrder{}[I2]])
-                                {
-                                    if constexpr(forward_dim2)
-                                    {
-                                        move_dynamic_tensor_coordinate(
-                                            dst_desc, dst_slice_origin_coord_,
-                                            dst_forward_iterators[DimAccessOrder{}[I2]]);
-                                    }
-                                    else
-                                    {
-                                        move_dynamic_tensor_coordinate(
-                                            dst_desc, dst_slice_origin_coord_,
-                                            dst_backward_iterators[DimAccessOrder{}[I2]]);
-                                    }
-                                }
-                            });
-
-                        // move along dim1
-                        if constexpr(iter1 < SliceLengths{}[DimAccessOrder{}[I1]] -
-                                                 dst_scalar_per_access[DimAccessOrder{}[I1]])
-                        {
-                            if constexpr(forward_dim1)
-                            {
-                                move_dynamic_tensor_coordinate(
-                                    dst_desc, dst_slice_origin_coord_,
-                                    dst_forward_iterators[DimAccessOrder{}[I1]]);
-                            }
-                            else
-                            {
-                                move_dynamic_tensor_coordinate(
-                                    dst_desc, dst_slice_origin_coord_,
-                                    dst_backward_iterators[DimAccessOrder{}[I1]]);
-                            }
-                        }
-                    });
-
-                // move along dim0
-                if constexpr(iter0 < SliceLengths{}[DimAccessOrder{}[I0]] -
-                                         dst_scalar_per_access[DimAccessOrder{}[I0]])
-                {
-                    move_dynamic_tensor_coordinate(
-                        dst_desc, dst_slice_origin_coord_, dst_forward_iterators[DimAccessOrder{}[I0]]);
-                }
-            });
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                    });
+                });
+
+                return move_on_dim;
+            }
+            ();
+
+            // move
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_dynamic_tensor_coordinate(
+                            dst_desc, dst_slice_origin_coord_, dst_forward_iterators[dim_access_order[i]]);
+                    }
+                    else
+                    {
+                        move_dynamic_tensor_coordinate(
+                            dst_desc, dst_slice_origin_coord_, dst_backward_iterators[dim_access_order[i]]);
+                    }
+                }
+            });
+        });

         // move dst coordinate back to slice origin (or not)
 ...
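The move_on_dim lambda encodes the matching carry rule: after an access, dimension i moves only when it is not yet at its last access position and every faster-varying dimension already is, so at most one iterator move is issued per store. A hedged sketch of that predicate as a plain function (hypothetical names, not the ck API):

    #include <array>
    #include <cstdio>

    // Decide which dimension (if any) should move after visiting ordered index `idx`
    // in a grid of extents `len`: the slowest dimension that is not at its end while
    // all faster dimensions are at their ends. Returns -1 when the traversal is done.
    template <std::size_t N>
    int dim_to_move(const std::array<int, N>& idx, const std::array<int, N>& len)
    {
        for(std::size_t i = 0; i < N; ++i)
        {
            bool move = idx[i] < len[i] - 1;
            for(std::size_t j = i + 1; j < N; ++j)
                move = move && (idx[j] == len[j] - 1);
            if(move)
                return static_cast<int>(i);
        }
        return -1; // last position of the whole slice
    }

    int main()
    {
        const std::array<int, 3> len{2, 3, 4};
        std::printf("%d\n", dim_to_move<3>({0, 0, 2}, len)); // 2: innermost still moving
        std::printf("%d\n", dim_to_move<3>({0, 0, 3}, len)); // 1: carry into the middle dimension
        std::printf("%d\n", dim_to_move<3>({0, 2, 3}, len)); // 0: carry into the outermost dimension
        std::printf("%d\n", dim_to_move<3>({1, 2, 3}, len)); // -1: traversal finished
    }

At most one dimension can satisfy the predicate at a time, which is why the per-access move cost stays constant regardless of nDim.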
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (continued)

@@ -340,7 +296,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     private:
     DstCoord dst_slice_origin_coord_;
 };
 // namespace ck

 // this version does following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
 // and sometimes useless instructions
 ...
composable_kernel/include/utility/container_helper.hpp

@@ -73,6 +73,30 @@ __host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<T
                                           old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
 }

+template <index_t... Is, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence<Is...> /* old_seq */,
+                                                                    Sequence<IRs...> /*new2old*/)
+{
+    static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
+
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
+
+    return Sequence<Sequence<Is...>::At(Number<IRs>{})...>{};
+}
+
+template <index_t... Is, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is...> old_seq,
+                                                                    Sequence<IRs...> /* old2new */)
+{
+    static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
+
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
+
+    constexpr auto new2old = typename sequence_map_inverse<Sequence<IRs...>>::type{};
+
+    return container_reorder_give_new2old(old_seq, new2old);
+}
+
 template <typename TData, typename Container, typename Reduce>
 __host__ __device__ constexpr TData container_reduce(const Container& a, Reduce f, TData init)
 {
 ...
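The two new Sequence overloads mirror the existing Tuple helpers: container_reorder_given_new2old picks element new2old[i] of the old container into slot i, and the old2new variant inverts the map and reuses the new2old rule. A minimal sketch of the same reordering semantics on plain std::array values (illustrative only, not the ck containers):

    #include <array>
    #include <cstdio>

    // new2old[i] tells which old position supplies element i of the reordered container.
    template <std::size_t N>
    std::array<int, N> reorder_given_new2old(const std::array<int, N>& old, const std::array<int, N>& new2old)
    {
        std::array<int, N> out{};
        for(std::size_t i = 0; i < N; ++i)
            out[i] = old[new2old[i]];
        return out;
    }

    // old2new[i] tells where old element i lands; invert the map and reuse the new2old rule.
    template <std::size_t N>
    std::array<int, N> reorder_given_old2new(const std::array<int, N>& old, const std::array<int, N>& old2new)
    {
        std::array<int, N> new2old{};
        for(std::size_t i = 0; i < N; ++i)
            new2old[old2new[i]] = static_cast<int>(i);
        return reorder_given_new2old(old, new2old);
    }

    int main()
    {
        const std::array<int, 4> old{10, 11, 12, 13};
        const std::array<int, 4> map{2, 0, 3, 1};

        const auto a = reorder_given_new2old(old, map); // {12, 10, 13, 11}
        const auto b = reorder_given_old2new(old, map); // {11, 13, 10, 12}

        std::printf("%d %d %d %d\n", a[0], a[1], a[2], a[3]);
        std::printf("%d %d %d %d\n", b[0], b[1], b[2], b[3]);
    }

The copy above uses the new2old form to reorder access lengths into traversal order, and the old2new form to map the reordered index back to the tensor's own dimension order.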
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -144,7 +144,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
-    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
 #elif 0
     // cdata = 64, BlockSize = 256, 128x128x8
     // b thread copy 2x2
 ...

@@ -233,7 +233,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

     constexpr auto conv_driver =
-#if 0
+#if 1
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #elif 0
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
 ...
driver/src/conv_driver.cpp

@@ -52,7 +52,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
 ...

@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
     device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                  in_nchw,
                                                                  wei_kcyx_desc,
 ...