gaoqiong / composable_kernel

Commit ca8ba252
Authored Feb 25, 2021 by Chao Liu
Commit message: refactor
Parent: edc08fe6
Showing 10 changed files with 40 additions and 42 deletions.
    composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp                    +1  -1
    composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  +4  -4
    composable_kernel/include/tensor_operation/threadwise_gemm.hpp                           +1  -1
    composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp                        +1  -1
    composable_kernel/include/utility/amd_buffer_addressing.hpp                              +4  -5
    composable_kernel/include/utility/amd_buffer_addressing_v2.hpp                           +6  -7
    composable_kernel/include/utility/float_type.amd.hpp.in                                  +4  -4
    composable_kernel/include/utility/float_type.nvidia.hpp.in                               +15 -15
    composable_kernel/include/utility/in_memory_operation.amd.hpp.in                         +2  -2
    composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in                      +2  -2
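All ten files receive the same mechanical change: the nested alias of vector_type is renamed from MemoryType to type, and every use site is updated to the new spelling. A reduced before/after illustration of the pattern, using the Float4 alias that appears in the first hunk below:

    // before this commit
    using Float4 = vector_type<float, 4>::MemoryType;

    // after this commit
    using Float4 = vector_type<float, 4>::type;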
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp

@@ -305,7 +305,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                       "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
                       "1 for now\n");

-        using Float4 = vector_type<float, 4>::MemoryType;
+        using Float4 = vector_type<float, 4>::type;

         Float4* reg_a = (Float4*)(p_a_thread);
         Float4* reg_b = (Float4*)(p_b_thread);
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -175,7 +175,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             // copy data
             vector_type<DstData, DstScalarPerVector> dst_vector;

-            using dst_vector_t = typename vector_type<DstData, DstScalarPerVector>::MemoryType;
+            using dst_vector_t = typename vector_type<DstData, DstScalarPerVector>::type;

             static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                 constexpr index_t src_offset =

@@ -504,7 +504,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
             vector_type<SrcData, SrcScalarPerVector> src_vector;

-            using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
+            using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::type;

             const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                 src_desc, src_slice_origin_coord_);

@@ -838,7 +838,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             // copy data
             vector_type<SrcData, SrcScalarPerVector> src_vector;

-            using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
+            using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::type;

             const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                 src_desc, src_slice_origin_coord_);

@@ -1031,7 +1031,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 dst_vector.Scalars()(i) = buffer_[Number<buffer_offset>{}];
             });

-            using DstVectorType = typename vector_type<DstData, DstScalarPerVector>::MemoryType;
+            using DstVectorType = typename vector_type<DstData, DstScalarPerVector>::type;

             *reinterpret_cast<DstVectorType*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
                 dst_vector.Vector();
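The hunks above all follow the same copy pattern: scalars are gathered into a vector_type register one by one, and the filled register is then written out through a single reinterpret_cast vector access. The following is a minimal standalone sketch of that pattern, not composable_kernel code: the struct and function names are made up for illustration, the plain loop stands in for static_for and descriptor-computed offsets, and the destination is assumed suitably aligned; the ext_vector_type attribute is the Clang vector extension that the library also uses.

    // Sketch only: vector_type_sketch_f4 and copy_strided_to_contiguous are
    // illustrative stand-ins, not part of composable_kernel.
    #include <cstddef>

    struct vector_type_sketch_f4
    {
        // after this commit the alias is spelled "type" (previously "MemoryType")
        typedef float type __attribute__((ext_vector_type(4))); // Clang vector extension
    };

    // gather four strided scalars, then write them back with one vector store
    inline void copy_strided_to_contiguous(const float* p_src, std::size_t src_stride, float* p_dst)
    {
        using dst_vector_t = vector_type_sketch_f4::type;

        dst_vector_t dst_vector;
        for(int i = 0; i < 4; ++i)
        {
            dst_vector[i] = p_src[i * src_stride]; // per-scalar gather
        }

        // single vectorized store (p_dst assumed 16-byte aligned), mirroring the
        // *reinterpret_cast<DstVectorType*>(...) = dst_vector.Vector(); line above
        *reinterpret_cast<dst_vector_t*>(p_dst) = dst_vector;
    }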
composable_kernel/include/tensor_operation/threadwise_gemm.hpp

@@ -39,7 +39,7 @@ struct ThreadwiseMatrixSliceCopy
    template <typename Data>
    __device__ static void Run(const Data* p_src, Data* p_dst)
    {
-        using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
+        using vector_t = typename vector_type<Data, DataPerAccess>::type;

        for(index_t i = 0; i < NSliceRow; ++i)
        {
composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp

@@ -41,7 +41,7 @@ struct ThreadwiseMatrixSliceCopy_v2
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

-        using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
+        using vector_t = typename vector_type<Data, DataPerAccess>::type;

        static_for<0, NSliceRow, 1>{}([&](auto i) {
            static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
composable_kernel/include/utility/amd_buffer_addressing.hpp

@@ -91,11 +91,10 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
// 2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
-__device__ typename vector_type<T, VectorSize>::MemoryType
+__device__ typename vector_type<T, VectorSize>::type
 amd_buffer_load(const T* p_src_wave,
                 index_t src_thread_data_offset,
                 bool src_thread_data_valid,
                 index_t src_elemenst_space);

// buffer_store requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
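For callers of amd_buffer_load, only the spelling of the returned vector type changes. An illustrative call sketch against the declaration above; p_src_wave, offset, valid and space are hypothetical variable names standing in for the wave-base pointer, per-thread data offset, validity flag and element space that the function expects:

    // hypothetical caller-side variables; shown only to illustrate the renamed alias
    using float4_vec = vector_type<float, 4>::type; // formerly ::MemoryType

    float4_vec v = amd_buffer_load<float, 4>(p_src_wave, offset, valid, space);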
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp

@@ -60,7 +60,7 @@ __llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
// 2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
-__device__ typename vector_type<T, VectorSize>::MemoryType
+__device__ typename vector_type<T, VectorSize>::type
 amd_buffer_load_v2(const T* p_src_wave,
                    index_t src_thread_data_offset,
                    bool src_thread_data_valid,

@@ -71,12 +71,11 @@ amd_buffer_load_v2(const T* p_src_wave,
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
 __device__ void
-amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
+amd_buffer_store_v2(const typename vector_type<T, VectorSize>::type src_thread_data,
                     T* p_dst_wave,
                     const index_t dst_thread_data_offset,
                     const bool dst_thread_data_valid,
                     const index_t dst_data_range);

template <>
__device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave,
composable_kernel/include/utility/float_type.amd.hpp.in

@@ -175,7 +175,7 @@ struct vector_type;
template <typename T>
struct vector_type<T, 1>
{
-    using MemoryType = T;
+    using type = T;

    union
    {

@@ -206,7 +206,7 @@ struct vector_type<T, 2>
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));

-    using MemoryType = d2_t;
+    using type = d2_t;

    union
    {

@@ -243,7 +243,7 @@ struct vector_type<T, 4>
    typedef T d2_t __attribute__((ext_vector_type(2)));
    typedef T d4_t __attribute__((ext_vector_type(4)));

-    using MemoryType = d4_t;
+    using type = d4_t;

    union
    {

@@ -286,7 +286,7 @@ struct vector_type<T, 8>
    typedef T d4_t __attribute__((ext_vector_type(4)));
    typedef T d8_t __attribute__((ext_vector_type(8)));

-    using MemoryType = d8_t;
+    using type = d8_t;

    union
    {
composable_kernel/include/utility/float_type.nvidia.hpp.in

@@ -32,16 +32,16 @@ struct vector_type
    typedef struct
    {
        T scalar[N];
-    } MemoryType;
+    } type;
};

template <>
struct vector_type<float, 1>
{
-    using MemoryType = float;
+    using type = float;

    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;

@@ -51,22 +51,22 @@ struct vector_type<float, 1>
template <>
struct vector_type<float, 2>
{
-    using MemoryType = float2_t;
+    using type = float2_t;

    union DataType
    {
-        MemoryType vector;
+        type vector;
        float scalar[2];
    };

    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

-    __host__ __device__ static MemoryType Pack(float s0, float s1)
+    __host__ __device__ static type Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;

@@ -78,12 +78,12 @@ struct vector_type<float, 2>
template <>
struct vector_type<float, 4>
{
-    using MemoryType = float4_t;
+    using type = float4_t;

    __host__ __device__ static constexpr index_t GetSize() { return 4; }

    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;

@@ -93,10 +93,10 @@ struct vector_type<float, 4>
template <>
struct vector_type<half_t, 1>
{
-    using MemoryType = half_t;
+    using type = half_t;

    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
+    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;

@@ -106,22 +106,22 @@ struct vector_type<half_t, 1>
template <>
struct vector_type<half_t, 2>
{
-    using MemoryType = half2_t;
+    using type = half2_t;

    union DataType
    {
-        MemoryType vector;
+        type vector;
        half_t scalar[2];
    };

    template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
+    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;
    }

-    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
+    __host__ __device__ static type Pack(half_t s0, half_t s1)
    {
        DataType data;
        data.scalar[0] = s0;
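Only the alias name changes for users of these specializations. A minimal usage sketch based on the members visible above (Pack and SetScalar of vector_type<float, 2>); the Number<> helper and the float2_t typedef are assumed from the rest of the library, and the function name is made up for illustration:

    // sketch only; assumes vector_type<float, 2>, float2_t and Number<> as defined in composable_kernel
    __host__ __device__ inline float2_t make_packed_example()
    {
        using vec2 = vector_type<float, 2>;

        vec2::type v = vec2::Pack(1.0f, 2.0f); // was vec2::MemoryType before this commit
        vec2::SetScalar(v, 3.0f, Number<0>{}); // overwrite lane 0 in place
        return v;
    }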
composable_kernel/include/utility/in_memory_operation.amd.hpp.in

@@ -44,7 +44,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
template <typename T, index_t DataPerAccess>
struct SetData
{
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

    // This version is only for compatibility, don't use this version if possible
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>

@@ -122,7 +122,7 @@ struct SetData
template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

    // This version is only for compatibility, don't use this version if possible
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in

@@ -37,7 +37,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
template <typename T, index_t DataPerAccess>
struct SetData
{
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const

@@ -50,7 +50,7 @@ struct SetData
template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const