gaoqiong/composable_kernel

Commit d3405258, authored Jul 29, 2020 by Chao Liu

prototype dynamic descriptor

parent 834eb24c

Showing 12 changed files with 1172 additions and 51 deletions
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp             +502 -0
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp              +19  -24
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp     +223 -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp         +237 -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp  +35  -0
composable_kernel/include/utility/amd_llvm_intrinsic.hpp                           +11  -0
composable_kernel/include/utility/common_header.hpp                                +1   -0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp            +2   -2
driver/include/device_dummy_dynamic_transform.hpp                                  +105 -0
driver/include/device_dummy_static_transform.hpp                                   +11  -11
driver/src/conv_driver.cpp                                                         +26  -13
driver/src/conv_driver.cu                                                          +0   -1
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
0 → 100644 (new file; diff collapsed, contents not shown)
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp

...
@@ -96,31 +96,26 @@ struct DummyStaticTransform
         auto coord =
             typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);

-        if(get_block_1d_id() < coord.GetOffset())
+#pragma unroll 1
+        for(index_t k = 0; k < 100; ++k)
         {
-            for(index_t k = 0; k < 1; ++k)
-            {
-                for(index_t n = 0; n < 4; ++n)
-                {
-                    auto tmp = coord + Array<index_t, 2>{k, n};
+            coord += Array<index_t, 2>{8, 0};

-                    Float value = 1;
+            Float value = 1;

-                    transfer_data<Float,
-                                  1,
-                                  AddressSpace::Vgpr,
-                                  AddressSpace::Global,
-                                  InMemoryDataOperation::Set,
-                                  1,
-                                  1>(&value,
-                                     0,
-                                     true,
-                                     1,
-                                     p_in_global,
-                                     tmp.GetOffset(),
-                                     tmp.IsOffsetValidAssumingUpperIndexIsValid(),
-                                     in_gemmk_gemmn_global_desc.GetElementSpace());
-                }
-            }
+            transfer_data<Float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(&value,
+                             0,
+                             true,
+                             1,
+                             p_in_global,
+                             coord.GetOffset(),
+                             coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                             in_gemmk_gemmn_global_desc.GetElementSpace());
         }
     }
 };
...
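The hunk above replaces rebuilding a fresh coordinate per access (tmp = coord + {k, n}) with stepping one coordinate in place (coord += {8, 0}). A minimal standalone sketch (plain C++ with hypothetical strides, not CK code) of why the incremental form is cheaper for a native strided layout: the offset delta of a constant step is itself constant, so per-iteration multiplications disappear.

#include <cstdio>

int main()
{
    const int strides[2] = {256, 1}; // hypothetical row-major strides
    int idx[2]           = {0, 0};
    int offset           = 0;

    // precompute the offset delta of a {8, 0} step once
    const int step[2]  = {8, 0};
    const int step_off = step[0] * strides[0] + step[1] * strides[1];

    for(int k = 0; k < 100; ++k)
    {
        idx[0] += step[0];
        idx[1] += step[1];
        offset += step_off; // no re-multiplication per iteration

        if(k % 25 == 0)
            std::printf("k=%d offset=%d\n", k, offset);
    }
    return 0;
}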
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP

#include "common_header.hpp"

namespace ck {

struct DynamicPassThrough
{
    using LowerIndex = MultiIndex<1>;
    using UpperIndex = MultiIndex<1>;

    index_t low_length_;

    __host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
    {
    }

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }

    // the upper length is the same as the (runtime) lower length
    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{low_length_};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

    __host__ __device__ static constexpr auto
    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                            const UpperIndex& /* idx_up_old */,
                            const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};

template <index_t NDimLow>
struct DynamicMerge
{
    static constexpr index_t ndim_low_ = NDimLow;
    static constexpr index_t ndim_up_  = 1;

    using LowerIndex = MultiIndex<ndim_low_>;
    using UpperIndex = MultiIndex<ndim_up_>;

    Array<index_t, NDimLow> low_lengths_;
    index_t up_length_;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        return Number<ndim_low_>{};
    }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        return Number<ndim_up_>{};
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{up_length_};
    }

    // pseudo strides: suffix products of the lower lengths, innermost stride is 1
    __host__ __device__ constexpr auto CalculatePseudoLowStrides() const
    {
        Array<index_t, NDimLow> pseudo_low_strides;

        pseudo_low_strides(NDimLow - 1) = 1;
        for(index_t i = NDimLow - 1; i > 0; --i)
        {
            pseudo_low_strides(i - 1) = pseudo_low_strides[i] * low_lengths_[i];
        }

        return pseudo_low_strides;
    }

    __host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;

        index_t itmp = idx_up[0];

        const auto pseudo_low_strides = CalculatePseudoLowStrides();

        for(index_t i = 0; i < NDimLow - 1; ++i)
        {
            idx_low(i) = itmp / pseudo_low_strides[i];
            itmp -= idx_low[i] * pseudo_low_strides[i];
        }

        idx_low(NDimLow - 1) = itmp / pseudo_low_strides[NDimLow - 1];

        return idx_low;
    }

    // idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
    // If idx_up_diff is known at compile-time, many calculations can be optimized
    // away by the compiler.
    // This function assumes idx_low_old is not out-of-bound.
    __host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                               const UpperIndex& /* idx_up_old */,
                                                               const LowerIndex& idx_low_old) const
    {
        if(idx_up_diff[0] == 0)
        {
            return make_zero_array<index_t, NDimLow>();
        }
        else
        {
            // CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
            // If idx_up_diff is known at compile-time, the calculation can
            // be done at compile-time. However, if idx_up_diff is only known
            // at run-time, then the calculation will also be computed at
            // run-time, and can be very expensive.
            LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);

            // find out the last low dimension that changed
            index_t last_changed_low_dim = 0;
            for(index_t i = 0; i < NDimLow; ++i)
            {
                if(idx_low_diff_tmp[i] != 0)
                {
                    last_changed_low_dim = i;
                }
            }

            LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;

            if(idx_up_diff[0] > 0)
            {
                // do carry check on each low dimension in reversed order,
                // starting from the last digit that changed;
                // don't check the highest dimension
                bool carry = false;

                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(carry)
                        {
                            ++idx_low_new(i);
                        }

                        carry = false;

                        if(idx_low_new[i] >= low_lengths_[i])
                        {
                            idx_low_new(i) -= low_lengths_[i];
                            carry = true;
                        }
                    }
                }

                // highest dimension, no out-of-bound check
                if(carry)
                {
                    ++idx_low_new(0);
                }
            }
            else
            {
                // do borrow check on each low dimension in reversed order,
                // starting from the last digit that changed;
                // don't check the highest dimension
                bool borrow = false;

                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(borrow)
                        {
                            --idx_low_new(i);
                        }

                        borrow = false;

                        if(idx_low_new[i] < 0)
                        {
                            idx_low_new(i) += low_lengths_[i];
                            borrow = true;
                        }
                    }
                }

                // highest dimension, no out-of-bound check
                if(borrow)
                {
                    --idx_low_new(0);
                }
            }

            return idx_low_new - idx_low_old;
        }
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};

} // namespace ck
#endif
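DynamicMerge above folds several lower dimensions into a single upper index, i.e. it treats the lower multi-index as mixed-radix digits of the upper index. A standalone sketch (plain C++ with made-up lengths, not CK code) showing the two costs that CalculateLowerIndexDiff trades between: a from-scratch decode needs one integer division per dimension, while a known positive diff only needs a digit-wise add with carry propagation.

#include <cassert>

int main()
{
    const int lengths[3] = {4, 3, 5};     // lower lengths (made up)
    const int strides[3] = {3 * 5, 5, 1}; // pseudo strides: suffix products

    // from-scratch decode: one integer division per lower dimension
    auto decode = [&](int up, int low[3]) {
        for(int i = 0; i < 3; ++i)
        {
            low[i] = up / strides[i];
            up -= low[i] * strides[i];
        }
    };

    int low_old[3];
    decode(13, low_old); // {0, 2, 3}

    // incremental update for diff = +3: decode the diff once, then
    // propagate carries from the innermost digit outward
    int diff[3];
    decode(3, diff);

    int low_new[3];
    bool carry = false;
    for(int i = 2; i >= 0; --i)
    {
        low_new[i] = low_old[i] + diff[i] + (carry ? 1 : 0);
        carry      = low_new[i] >= lengths[i];
        if(carry)
        {
            low_new[i] -= lengths[i];
        }
    }

    // matches decoding 13 + 3 = 16 directly
    int check[3];
    decode(16, check);
    for(int i = 0; i < 3; ++i)
    {
        assert(low_new[i] == check[i]);
    }

    return 0;
}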
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP

#include "common_header.hpp"

namespace ck {

template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
    using Index = MultiIndex<NDim>;

    Array<index_t, NDim> lengths_;
    Array<index_t, NDim> strides_;

    index_t element_size_;
    index_t element_space_;

    template <typename Lengths, typename Strides>
    __host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
                                                                const Strides& strides)
        : lengths_(lengths), strides_(strides)
    {
        element_size_ = 1;
        for(index_t i = 0; i < NDim; ++i)
        {
            element_size_ *= lengths_[i];
        }

        element_space_ = 1;
        for(index_t i = 0; i < NDim; ++i)
        {
            element_space_ += (lengths_[i] - 1) * strides_[i];
        }
    }

    __host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }

    __host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }

    __host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }

    __host__ __device__ constexpr auto GetLengths() const { return lengths_; }

    __host__ __device__ constexpr auto GetStrides() const { return strides_; }

    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }

    __host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }

    __host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
    {
        index_t offset = 0;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            offset += idx[i] * strides_[i];
        }

        return offset;
    }

    __host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
    {
        index_t offset_diff = 0;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            offset_diff += idx_diff[i] * strides_[i];
        }

        return offset_diff;
    }

    __host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
    {
        bool flag = true;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
        }

        return flag;
    }
};
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
          typename Transforms,     // Tuple<DynamicMultiIndexTransforms,...>
          typename LowDimensions,  // Tuple<Sequence<...>,...>
          typename UpDimensions>   // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
    using Type = DynamicTransformedTensorDescriptor;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        // Here, we assume all lower-dimensions are active
        // TODO: sanity-check all lower-dimensions are indeed active
        using duplicated_low_active_dims =
            decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));

        using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
                                                              math::less<index_t>,
                                                              math::equal<index_t>>::type;

        return low_active_dims::Size();
    }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        using duplicated_up_active_dims =
            decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));

        using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
                                                             math::less<index_t>,
                                                             math::equal<index_t>>::type;

        return up_active_dims::Size();
    }

    static constexpr index_t ndim_up_       = GetNumOfUpperDimension();
    static constexpr index_t ndim_low_      = GetNumOfLowerDimension();
    static constexpr index_t num_transform_ = Transforms::Size();

    using UpperIndex = MultiIndex<ndim_up_>;
    using LowerIndex = MultiIndex<ndim_low_>;

    const LowTensorDescriptor low_tensor_desc_;
    const Transforms transforms_;
    const LowDimensions low_dims_;
    const UpDimensions up_dims_;

    __host__ __device__ constexpr
    DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
                                       const Transforms& transforms)
        : low_tensor_desc_(low_tensor_desc), transforms_(transforms)
    {
    }

    __host__ __device__ static constexpr auto GetNumOfDimension()
    {
        return GetNumOfUpperDimension();
    }

    __host__ __device__ constexpr auto GetLowerTensorDescriptor() const
    {
        return low_tensor_desc_;
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        // TODO: not implemented yet
    }

    __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }

    __host__ __device__ constexpr auto GetLength(index_t i) const { return GetLengths()[i]; }

    __host__ __device__ constexpr auto GetElementSize() const
    {
        index_t element_size = 1;

        for(index_t i = 0; i < ndim_up_; ++i)
        {
            element_size *= GetLength(i);
        }

        return element_size;
    }

    __host__ __device__ constexpr auto GetElementSpace() const
    {
        return low_tensor_desc_.GetElementSpace();
    }

    // TODO: right now return value is not constexpr because of use of non-constexpr lambda
    __host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;

        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
            auto idx_low_part      = pick_array_element(idx_low, LowDimensions{}.At(itran));

            // this assumes each lower (single) index is associated with only one
            // transformation, which is required for index transformation, and has been
            // checked during the constructor of DynamicTransformedTensorDescriptor
            idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
        });

        return idx_low;
    }

    // TODO: right now return value is not constexpr because of use of non-constexpr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
                pick_array_element(idx_up_diff, UpDimensions{}.At(itran));

            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));

            const auto idx_low_old_part =
                pick_array_element(idx_low_old, LowDimensions{}.At(itran));

            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));

            // this assumes each lower (single) index is associated with only one
            // transformation, which is required for index transformation, and has been
            // checked during the constructor of DynamicTransformedTensorDescriptor
            idx_low_diff_part = tran.CalculateLowerIndexDiff(
                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;
    }

    __host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
    {
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }
};
#endif

} // namespace ck
#endif
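A worked example of the two size notions in DynamicNativeTensorDescriptor above: element_size_ is the count of logical elements (product of lengths), while element_space_ = 1 + sum over i of (length_i - 1) * stride_i is the linear extent the tensor actually spans; the two differ as soon as a stride is padded. A standalone sketch with made-up lengths and strides (not CK code):

#include <cstdio>

int main()
{
    const int lengths[4] = {2, 3, 4, 5};
    const int strides[4] = {120, 40, 10, 1}; // stride 10 leaves a gap after the innermost extent 5

    int element_size  = 1; // number of logical elements
    int element_space = 1; // linear extent actually spanned
    for(int i = 0; i < 4; ++i)
    {
        element_size *= lengths[i];
        element_space += (lengths[i] - 1) * strides[i];
    }

    // element_size = 120, element_space = 235: the padded stride leaves holes
    std::printf("element_size=%d element_space=%d\n", element_size, element_space);
    return 0;
}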
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"

namespace ck {

template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const Lengths& lengths,
                                                                         const Strides& strides)
{
    static_assert(Lengths::GetSize() == Strides::GetSize(), "wrong! Size not the same");

    return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}

template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensions,
          typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
                                    const Transforms& transforms,
                                    LowDimensions,
                                    UpDimensions)
{
    return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
                                              Transforms,
                                              LowDimensions,
                                              UpDimensions>(low_tensor_desc, transforms);
}

} // namespace ck
#endif
composable_kernel/include/utility/amd_llvm_intrinsic.hpp
0 → 100644 (new file)
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP

#include "float_type.hpp"

namespace ck {

__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");

} // namespace ck
#endif
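A hedged usage sketch for the intrinsic above (HIP device code, illustrative only; assumes a ROCm target and that the header is included): llvm.amdgcn.readfirstlane broadcasts lane 0's value across the wavefront and yields a scalar (SGPR) value, which is useful when every thread is known to share the same dynamically computed offset.

#include <hip/hip_runtime.h>
#include "amd_llvm_intrinsic.hpp"

// hypothetical kernel, not part of this commit
__global__ void broadcast_offset(const int32_t* p_offset, int32_t* p_out)
{
    // all lanes load the same value; the intrinsic lets the compiler keep it
    // in one scalar register instead of one vector register per lane
    int32_t offset = ck::__llvm_amdgcn_readfirstlane_i32(p_offset[0]);

    p_out[threadIdx.x] = offset;
}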
composable_kernel/include/utility/common_header.hpp
...
@@ -20,6 +20,7 @@
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
+#include "amd_llvm_intrinsic.hpp"
 #endif

 #if CK_USE_AMD_XDLOPS
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
...
@@ -172,7 +172,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x16
     constexpr index_t BlockSize = 256;
...
@@ -290,7 +290,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 128, 64x128x8
     constexpr index_t BlockSize = 128;
...
driver/include/device_dummy_dynamic_transform.hpp
0 → 100644 (new file)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
void device_dummy_dynamic_transform(InDesc,
                                    const Tensor<T>& in_nchw,
                                    WeiDesc,
                                    const Tensor<T>& wei_kcyx,
                                    OutDesc,
                                    Tensor<T>& out_nkhw,
                                    ConvStrides,
                                    ConvDilations,
                                    InLeftPads,
                                    InRightPads,
                                    ck::index_t nrepeat)
{
    using namespace ck;

    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;

    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor(
        to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));

    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor(
        to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));

    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor(
        to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));

    const auto conv_strides   = to_array(ConvStrides{});
    const auto conv_dilations = to_array(ConvDilations{});
    const auto in_left_pads   = to_array(InLeftPads{});
    const auto in_right_pads  = to_array(InRightPads{});

    std::size_t data_sz = sizeof(T);

    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    constexpr index_t BlockSize = 256;
    constexpr index_t GridSize  = 1;

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    using dummy_transform = DummyDynamicTransform<BlockSize>;

    for(index_t i = 0; i < 5; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<dummy_transform,
                                                 index_t* const,
                                                 index_t* const,
                                                 float* const,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 index_t,
                                                 index_t,
                                                 index_t,
                                                 index_t>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
                          wei_kcyx_desc,
                          in_nchw_desc,
                          out_nkhw_desc,
                          conv_strides,
                          conv_dilations,
                          in_left_pads,
                          in_right_pads,
                          10,
                          10,
                          10,
                          10);
        }
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
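The launch above passes the three DynamicNativeTensorDescriptor<4> objects and the stride/pad arrays to the kernel by value. A standalone sketch (HIP, with a hypothetical Desc2d stand-in; not CK code) of the underlying pattern: runtime descriptors are plain aggregates of integers, so they travel as ordinary kernel arguments.

#include <hip/hip_runtime.h>
#include <cstdio>

struct Desc2d // hypothetical stand-in for a dynamic 2-D descriptor
{
    int lengths[2];
    int strides[2];
};

__global__ void use_desc(Desc2d desc, int* p_out)
{
    // offset of element (1, 2) computed from the runtime descriptor
    p_out[0] = 1 * desc.strides[0] + 2 * desc.strides[1];
}

int main()
{
    Desc2d desc{{4, 8}, {8, 1}};

    int* p_out = nullptr;
    hipMalloc(&p_out, sizeof(int));

    hipLaunchKernelGGL(use_desc, dim3(1), dim3(1), 0, 0, desc, p_out);

    int result = 0;
    hipMemcpy(&result, p_out, sizeof(int), hipMemcpyDeviceToHost);
    std::printf("offset = %d\n", result); // 10

    hipFree(p_out);
    return 0;
}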
driver/include/device_dummy_transform.hpp → driver/include/device_dummy_static_transform.hpp
...
@@ -12,17 +12,17 @@ template <class T,
           class ConvDilations,
           class InLeftPads,
           class InRightPads>
-void device_dummy_transform(InDesc,
-                            const Tensor<T>& in_nchw,
-                            WeiDesc,
-                            const Tensor<T>& wei_kcyx,
-                            OutDesc,
-                            Tensor<T>& out_nkhw,
-                            ConvStrides,
-                            ConvDilations,
-                            InLeftPads,
-                            InRightPads,
-                            ck::index_t nrepeat)
+void device_dummy_static_transform(InDesc,
+                                   const Tensor<T>& in_nchw,
+                                   WeiDesc,
+                                   const Tensor<T>& wei_kcyx,
+                                   OutDesc,
+                                   Tensor<T>& out_nkhw,
+                                   ConvStrides,
+                                   ConvDilations,
+                                   InLeftPads,
+                                   InRightPads,
+                                   ck::index_t nrepeat)
 {
     using namespace ck;
...
driver/src/conv_driver.cpp
...
@@ -14,7 +14,8 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_dummy_transform.hpp"
+#include "device_dummy_static_transform.hpp"
+#include "device_dummy_dynamic_transform.hpp"

 int main(int argc, char* argv[])
 {
...
@@ -200,7 +201,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 3x3, 35x35, stride 2
     constexpr index_t N = 128;
     constexpr index_t C = 288;
...
@@ -572,18 +573,30 @@ int main(int argc, char* argv[])
                           LeftPads{},
                           RightPads{},
                           nrepeat);
+#elif 0
+    device_dummy_static_transform(in_nchw_desc,
+                                  in_nchw,
+                                  wei_kcyx_desc,
+                                  wei_kcyx,
+                                  out_nkhw_desc,
+                                  out_nkhw_device,
+                                  ConvStrides{},
+                                  ConvDilations{},
+                                  LeftPads{},
+                                  RightPads{},
+                                  nrepeat);
 #elif 1
-    device_dummy_transform(in_nchw_desc,
-                           in_nchw,
-                           wei_kcyx_desc,
-                           wei_kcyx,
-                           out_nkhw_desc,
-                           out_nkhw_device,
-                           ConvStrides{},
-                           ConvDilations{},
-                           LeftPads{},
-                           RightPads{},
-                           nrepeat);
+    device_dummy_dynamic_transform(in_nchw_desc,
+                                   in_nchw,
+                                   wei_kcyx_desc,
+                                   wei_kcyx,
+                                   out_nkhw_desc,
+                                   out_nkhw_device,
+                                   ConvStrides{},
+                                   ConvDilations{},
+                                   LeftPads{},
+                                   RightPads{},
+                                   nrepeat);
 #endif

     if(do_verification)
...
driver/src/conv_driver.cu
deleted 120000 → 0 (symlink)
-conv_driver.cpp
\ No newline at end of file