gaoqiong / composable_kernel

Commit ff2c373b
authored Nov 18, 2019 by Chao Liu

amd build

parent e5874b3f
Showing 10 changed files with 117 additions and 94 deletions:
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp            +1   -1
composable_kernel/include/tensor_description/tensor_coordinate.hpp                   +2   -2
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp        +2   -2
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp            +6   -6
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +12  -18
composable_kernel/include/utility/config.amd.hpp.in                                  +7   -0
composable_kernel/include/utility/float_type.amd.hpp.in                              +0   -53
composable_kernel/include/utility/functional.hpp                                     +2   -4
composable_kernel/include/utility/in_memory_operation.amd.hpp.in                     +78  -0
composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in                  +7   -8
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp

@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto
template <typename... Ts>
__host__ __device__ constexpr auto
-make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
+make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{
    using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;

    static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
...
composable_kernel/include/tensor_description/tensor_coordinate.hpp

@@ -228,7 +228,7 @@ struct TensorCoordinate
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
-   MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+   MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...

@@ -236,7 +236,7 @@ struct TensorCoordinate
    template <typename... Ts>
    __host__ __device__ static constexpr auto
-   MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+   MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp

@@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
-   MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
+   MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
-   MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
+   MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
    {
        return MergedTensorCoordinate_deprecated<ConstantMergedTensorDescriptor_deprecated<Ts...>>();
...
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -64,10 +64,10 @@ template <typename LowerTensorDescriptor,
          index_t... LowerDimensionIds,
          index_t... UpperDimensionIds>
__host__ __device__ constexpr auto
-reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
-                                           Sequence<LowerLengths...>,
-                                           Sequence<LowerDimensionIds...>,
-                                           Sequence<UpperDimensionIds...>)
+reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
+                                           Sequence<LowerLengths...>,
+                                           Sequence<LowerDimensionIds...>,
+                                           Sequence<UpperDimensionIds...>)
{
    return TransformedTensorDescriptor<LowerTensorDescriptor,
                                       Tuple<PassThrough<LowerLengths>...>,
...

@@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
// reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
-reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
+reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");
...

@@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
-reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
+reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -166,28 +166,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
            // has the same padding situation
            if(dst_coord.IsUpperIndexMappedToValidOffset())
            {
-#if 0
-                static_if<!DoAtomicAdd>{}([&](auto) {
-                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                        amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
-                            fwd(p_dst),
-                            dst_coord.GetOffset(),
-                            0);
-#else
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-#endif
-                    }).Else([&](auto) {
-                        // dst can be all kinds of memory-space
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-                    });
-                }).Else([&](auto) {
-                    atomicAdd(reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
-                              *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
-                });
+#if 0 // debug
+                static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+#if CK_USE_AMD_BUFFER_ADDRESSING
+                    amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
+                        fwd(p_dst),
+                        dst_coord.GetOffset(),
+                        0);
+#else
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+#endif
+                }).Else([&](auto) {
+                    // dst can be all kinds of memory-space
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                });
#else
                move_data<DstData,
...
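The live branch of this hunk (outside the #if 0 debug block) now delegates the store to the move_data helper introduced in in_memory_operation.amd.hpp.in further down. The call itself is truncated in this view; a hypothetical sketch of its shape, inferred from that helper's template parameters and not taken from the diff:

    // Hypothetical reconstruction: the diff only shows "move_data<DstData,".
    // The remaining template arguments and the vgpr source address space are
    // assumptions based on the move_data<T, DataPerAccess, SrcAddressSpace,
    // DstAddressSpace, DstInMemOp> helper added by this commit.
    move_data<DstData, DstDataPerAccess, AddressSpace::vgpr, DstAddressSpace, DstInMemOp>(
        p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());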
composable_kernel/include/utility/config.amd.hpp.in

@@ -55,9 +55,16 @@ enum AddressSpace
{
    generic,
    global,
    lds,
    vgpr
};
+
+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
+};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
...
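The new InMemoryDataOperation enum is a compile-time tag; elsewhere in this commit it is carried as a non-type template parameter and branched on with static_if. A minimal illustrative sketch of that pattern (store_scalar is a made-up name, not from the repository):

    // Illustration only: compile-time dispatch on the new enum, mirroring the
    // static_if pattern used by move_data later in this commit.
    template <InMemoryDataOperation Op>
    __device__ void store_scalar(float* p_dst, index_t dst_offset, float value)
    {
        static_if<Op == InMemoryDataOperation::none>{}(
            [&](auto) { p_dst[dst_offset] = value; });

        static_if<Op == InMemoryDataOperation::atomic_add>{}(
            [&](auto) { atomicAdd(&p_dst[dst_offset], value); });
    }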
composable_kernel/include/utility/float_type.amd.hpp.in

@@ -307,58 +307,5 @@ struct inner_product_with_conversion
    }
};

-template <DataMovement Movement, AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void move_data(const float* p_src,
-               index_t src_offset,
-               float* p_dst,
-               index_t dst_offset,
-               integral_constant<DataMovement, Movement>,
-               integral_constant<AddressSpace, SrcAddressSpace> src_address_space,
-               integral_constant<AddressSpace, DstAddressSpace> dst_address_space)
-{
-    // TODO: use static_if::ElseIf
-    static_if<Movement == DataMovement::copy>{}([&](auto) {
-        copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
-    });
-
-    static_if<Movement == DataMovement::atomic_add>{}([&](auto) {
-        atomic_add_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
-    });
-}
-
-template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void copy_data(const float* p_src,
-               index_t src_offset,
-               float* p_dst,
-               index_t dst_offset,
-               integral_constant<AddressSpace, SrcAddressSpace>,
-               integral_constant<AddressSpace, DstAddressSpace>)
-{
-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-            amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0);
-#else
-            p_dst[dst_offset] = p_src[src_offset];
-#endif
-        })
-        .Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; });
-}
-
-template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void atomic_add_data(const float* p_src,
-                     index_t src_offset,
-                     float* p_dst,
-                     index_t dst_offset,
-                     integral_constant<AddressSpace, SrcAddressSpace>,
-                     integral_constant<AddressSpace, DstAddressSpace>)
-{
-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); })
-        .Else([&](auto fwd) {
-            static_assert(fwd(false), "atomic_add doesn't support this memory space");
-        });
-}
-
} // namespace ck
#endif
composable_kernel/include/utility/functional.hpp

@@ -64,9 +64,8 @@ struct static_if<true>
    }

    template <typename F>
-   __host__ __device__ static constexpr auto Else(F)
+   __host__ __device__ static void Else(F)
    {
-       return Type{};
    }
};
...

@@ -82,14 +81,13 @@ struct static_if<false>
    }

    template <typename F>
-   __host__ __device__ static constexpr auto Else(F f)
+   __host__ __device__ static void Else(F f)
    {
        // This is a trick for compiler:
        //   Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
        //   this will make "f" a generic lambda, so that "f" won't be compiled until being
        //   instantiated here
        f(forwarder{});
-       return Type{};
    }
};
...
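The comment kept in the second hunk describes the forwarder trick; a small illustration of how callers rely on it (not part of the diff, and the global-only restriction is invented for the example):

    // Illustration only: because the Else lambda takes its forwarder as an "auto"
    // parameter, its body stays a dependent, uninstantiated generic lambda, so the
    // static_assert fires only when the Else branch is actually selected.
    template <AddressSpace Dst>
    __device__ void store_to_global_only(float* p_dst, index_t offset, float v)
    {
        static_if<Dst == AddressSpace::global>{}([&](auto) {
            p_dst[offset] = v; // compiled only when Dst == AddressSpace::global
        }).Else([&](auto fwd) {
            static_assert(fwd(false), "this sketch only supports a global destination");
        });
    }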
composable_kernel/include/utility/in_memory_operation.amd.hpp.in (new file, mode 100644)
#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf
static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr &&
DstAddressSpace == AddressSpace::global>{}([&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
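A minimal usage sketch of the new interface (not part of the commit; example_store is a made-up name, everything else follows the signatures above):

    // Usage sketch only. Copies four floats from a thread-private (vgpr) buffer
    // into global memory, then accumulates a single float with atomic_add,
    // using the move_data/copy_data/atomic_add_data templates defined above.
    __device__ void example_store(const float* p_thread_buf, float* p_global, index_t dst_offset)
    {
        // plain vectorized store: InMemoryDataOperation::none selects copy_data
        move_data<float, 4, AddressSpace::vgpr, AddressSpace::global, InMemoryDataOperation::none>(
            p_thread_buf, 0, p_global, dst_offset);

        // accumulate instead of overwrite: atomic_add selects atomic_add_data
        move_data<float, 1, AddressSpace::vgpr, AddressSpace::global, InMemoryDataOperation::atomic_add>(
            p_thread_buf, 0, p_global, dst_offset);
    }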
composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in

@@ -23,14 +23,13 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

-   static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-       [&](auto) {
-           atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                     *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
-       })
-       .Else([&](auto fwd) {
-           static_assert(fwd(false), "atomic_add doesn't support this memory space");
-       });
+   static_if<SrcAddressSpace == AddressSpace::vgpr &&
+             DstAddressSpace == AddressSpace::global>{}([&](auto) {
+       atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                 *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+   }).Else([&](auto fwd) {
+       static_assert(fwd(false), "atomic_add doesn't support this memory space");
+   });
}

template <typename T,
...