yangql / composable_kernel-1 · Commit 562e1e27 (unverified)

Authored Nov 04, 2019 by Chao Liu, committed by GitHub on Nov 04, 2019
Parent: 52c3fe05

MIOpen integration: recent bug fixes from MIOpen (#5)

Showing 9 changed files with 415 additions and 97 deletions (+415 −97).
composable_kernel/include/tensor_operation/threadwise_gemm.hpp                                    +10 −10
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp                +4  −4
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp     +4  −4
composable_kernel/include/utility/amd_buffer_addressing.hpp                                       +34 −33
composable_kernel/include/utility/amd_inline_asm.hpp                                              +42 −40
composable_kernel/include/utility/common_header.hpp                                                +4  −0
composable_kernel/include/utility/config.amd.hpp.in                                                +2  −2
composable_kernel/include/utility/float_type.hpp (new file)                                      +311  −0
driver/src/driver.cpp                                                                              +4  −4
composable_kernel/include/tensor_operation/threadwise_gemm.hpp
@@ -114,7 +114,7 @@ struct ThreadwiseGemmTransANormalBNormalC
                 const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
                 const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);

-                __outer_product_1x2(
+                amd_assembly_outer_product_1x2(
                     p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
             });

@@ -129,7 +129,7 @@ struct ThreadwiseGemmTransANormalBNormalC
                 const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
                 const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);

-                __outer_product_1x4(p_a[aindex],
+                amd_assembly_outer_product_1x4(p_a[aindex],
                                     p_b[bindex_0],
                                     p_b[bindex_1],
                                     p_b[bindex_2],
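For reference, the renamed helpers keep the semantics documented in amd_inline_asm.hpp, c[i,j] += inner_product(a[i], b[j]). A minimal scalar sketch of what the 1x2 update computes (illustrative only, not part of the commit):

// Scalar reference for the 1x2 outer-product accumulate called above.
// The real amd_assembly_outer_product_1x2 maps this onto packed-math /
// v_dot instructions on the GPU; this is just the arithmetic it performs.
inline void outer_product_1x2_reference(float a, float b0, float b1, float& c0, float& c1)
{
    c0 += a * b0; // c[0] += inner_product(a, b[0])
    c1 += a * b1; // c[1] += inner_product(a, b[1])
}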
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -123,7 +123,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                        __buffer_load<SrcData, SrcDataPerAccess>(
+                        amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                             fwd(p_src), src_coord.GetOffset(), 0);
 #else
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =

@@ -162,7 +162,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                        __buffer_store<DstData, DstDataPerAccess>(
+                        amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                             *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             fwd(p_dst),
                             dst_coord.GetOffset(),

@@ -311,7 +311,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                        __buffer_load<SrcData, SrcDataPerAccess>(
+                        amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                             p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
 #else
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =

@@ -503,7 +503,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                        __buffer_store<DstData, DstDataPerAccess>(
+                        amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                             *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_nonlinear_coord.GetOffset(),
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
@@ -335,7 +335,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
             // 3. src_merged_offset can be runtime value (no assumption imposed)
             static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                vector_data = __buffer_load<SrcData, SrcDataPerAccess>(
+                vector_data = amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                     fwd(p_src), src_merged_offset, src_normal_offset);
 #else
                 vector_data = *reinterpret_cast<const src_vector_t*>(

@@ -375,7 +375,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
         // copy data from buffer into dst
         {
-            using dst_vector_t = typename vector_type<SrcData, DstDataPerAccess>::MemoryType;
+            using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

             constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
             constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

@@ -420,7 +420,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
                     const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                         dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
-                    reinterpret_cast<SrcData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
+                    reinterpret_cast<DstData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
                 }

             // offset w.r.t. normal dimension is known at compile-time

@@ -444,7 +444,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
             // 3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
-                __buffer_store<SrcData, DstDataPerAccess>(
+                amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                     vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset);
 #else
                 *reinterpret_cast<dst_vector_t*>(
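Besides the rename, this file also switches SrcData to DstData on the destination path (the @@ -375, @@ -420 and @@ -444 hunks). A hedged illustration of why that matters when the two element types differ in size (the concrete types below are chosen only for the example and are not taken from the commit):

// Hypothetical illustration: with SrcData = float and DstData = half,
// a dst_vector_t built from SrcData would span twice the bytes that
// DstDataPerAccess destination elements actually occupy, so the
// reinterpret_cast-based writes would use the wrong access width.
static_assert(sizeof(ck::vector_type<float, 4>::MemoryType) == 16, "float4_t is 16 bytes");
static_assert(sizeof(ck::vector_type<half, 4>::MemoryType) == 8, "half4_t is 8 bytes");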
composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -19,53 +19,54 @@ __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                            index_t vindex,
                                            index_t offset,
                                            bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.load");
+                                           bool slc) __asm("llvm.amdgcn.buffer.load.f32");

 __device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
-                                                bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
+                                                bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

 __device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
-                                                bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
+                                                bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");

 __device__ void __llvm_amdgcn_buffer_store(float vdata,
                                            int32x4_t rsrc,
                                            index_t vindex,
                                            index_t offset,
                                            bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.store");
+                                           bool slc) __asm("llvm.amdgcn.buffer.store.f32");

 __device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
                                              int32x4_t rsrc,
                                              index_t vindex,
                                              index_t offset,
                                              bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
+                                             bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

 __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
                                              int32x4_t rsrc,
                                              index_t vindex,
                                              index_t offset,
                                              bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
+                                             bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
-__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
+amd_intrinsic_buffer_load(const T* p_src_block,
+                          index_t src_thread_data_offset,
+                          index_t src_const_data_offset);

 template <typename T, index_t VectorSize>
-__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+__device__ void amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                                T* p_dst_block,
                                index_t dst_thread_data_offset,
                                index_t dst_const_data_offset);

 template <>
-__device__ float __buffer_load<float, 1>(const float* p_src_block,
+__device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block,
                                          index_t src_thread_data_offset,
                                          index_t src_const_data_offset)
 {

@@ -100,7 +101,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
 }

 template <>
-__device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
+__device__ float2_t amd_intrinsic_buffer_load<float, 2>(const float* p_src_block,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset)
 {

@@ -135,7 +136,7 @@ __device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
 }

 template <>
-__device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
+__device__ float4_t amd_intrinsic_buffer_load<float, 4>(const float* p_src_block,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset)
 {

@@ -170,7 +171,7 @@ __device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
 }

 template <>
-__device__ void __buffer_store<float, 1>(const float& src,
+__device__ void amd_intrinsic_buffer_store<float, 1>(const float& src,
                                          float* p_dst_block,
                                          index_t dst_thread_data_offset,
                                          index_t dst_const_data_offset)

@@ -207,7 +208,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
 }

 template <>
-__device__ void __buffer_store<float, 2>(const float2_t& src,
+__device__ void amd_intrinsic_buffer_store<float, 2>(const float2_t& src,
                                          float* p_dst_block,
                                          index_t dst_thread_data_offset,
                                          index_t dst_const_data_offset)

@@ -244,7 +245,7 @@ __device__ void __buffer_store<float, 2>(const float2_t& src,
 }

 template <>
-__device__ void __buffer_store<float, 4>(const float4_t& src,
+__device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src,
                                          float* p_dst_block,
                                          index_t dst_thread_data_offset,
                                          index_t dst_const_data_offset)
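Based on the declarations shown above, a call site for the renamed wrappers would read roughly as follows. This is a sketch, not code from the commit; the function and offset names are invented for the example, and offsets are assumed to be in elements as in the signatures above.

// Sketch of a caller of the renamed buffer-addressing wrappers.
__device__ void copy_four_floats(const float* p_src_block,
                                 float* p_dst_block,
                                 ck::index_t thread_offset)
{
    using namespace ck;
    // 128-bit buffer load, lowered to llvm.amdgcn.buffer.load.v4f32.
    float4_t v = amd_intrinsic_buffer_load<float, 4>(p_src_block, thread_offset, 0);
    // 128-bit buffer store, lowered to llvm.amdgcn.buffer.store.v4f32.
    amd_intrinsic_buffer_store<float, 4>(v, p_dst_block, thread_offset, 0);
}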
composable_kernel/include/utility/amd_inline_asm.hpp
@@ -6,7 +6,7 @@
 namespace ck {

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
+__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
 {
 // disable inline asm due to the compiler issue: SWDEV-202749
 ///\to-do: enable the inline asm after the compiler fix

@@ -24,7 +24,7 @@ __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, floa
 }

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(
+__device__ void amd_assembly_outer_product_1x4(
     float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
 {
     asm volatile("\n \

@@ -38,11 +38,12 @@ __device__ void __outer_product_1x4(
 }

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
+__device__ void amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
 {
     asm volatile("\n \
-            v_dot2_f32_f16 %0, %2, %3 %0 \n \
-            v_dot2_f32_f16 %1, %2, %4 %1 \n \
+            v_dot2_f32_f16 %0, %2, %3, %0 \n \
+            v_dot2_f32_f16 %1, %2, %4, %1 \n \
             "
             : "=v"(c0), "=v"(c1) // Dest registers
             : "v"(a),            // 1st Src register for 1 half2 registers

@@ -53,7 +54,8 @@ __device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0
 }

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
+__device__ void amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
 {
     const half2_t* p_a_half2  = reinterpret_cast<const half2_t*>(&a);
     const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);

@@ -61,10 +63,10 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
     // do dot2 two times
     asm volatile("\n \
-            v_dot2_f32_f16 %0, %2, %4 %0 \n \
-            v_dot2_f32_f16 %1, %2, %6 %1 \n \
-            v_dot2_f32_f16 %0, %3, %5 %0 \n \
-            v_dot2_f32_f16 %1, %3, %7 %1 \n \
+            v_dot2_f32_f16 %0, %2, %4, %0 \n \
+            v_dot2_f32_f16 %1, %2, %6, %1 \n \
+            v_dot2_f32_f16 %0, %3, %5, %0 \n \
+            v_dot2_f32_f16 %1, %3, %7, %1 \n \
             "
             : "=v"(c0), "=v"(c1) // Dest registers
             : "v"(p_a_half2[0]),

@@ -78,7 +80,7 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
 }

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(half2_t a,
+__device__ void amd_assembly_outer_product_1x4(half2_t a,
                                     half2_t b0,
                                     half2_t b1,
                                     half2_t b2,

@@ -89,10 +91,10 @@ __device__ void __outer_product_1x4(half2_t a,
                                     float& c3)
 {
     asm volatile("\n \
-            v_dot2_f32_f16 %0, %4, %5 %0 \n \
-            v_dot2_f32_f16 %1, %4, %6 %1 \n \
-            v_dot2_f32_f16 %2, %4, %7 %2 \n \
-            v_dot2_f32_f16 %3, %4, %8 %3 \n \
+            v_dot2_f32_f16 %0, %4, %5, %0 \n \
+            v_dot2_f32_f16 %1, %4, %6, %1 \n \
+            v_dot2_f32_f16 %2, %4, %7, %2 \n \
+            v_dot2_f32_f16 %3, %4, %8, %3 \n \
             "
             : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
             : "v"(a), // 1st Src register for 1 half2 registers

@@ -107,7 +109,7 @@ __device__ void __outer_product_1x4(half2_t a,
 }

 // outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(half4_t a,
+__device__ void amd_assembly_outer_product_1x4(half4_t a,
                                     half4_t b0,
                                     half4_t b1,
                                     half4_t b2,

@@ -125,14 +127,14 @@ __device__ void __outer_product_1x4(half4_t a,
     // do dot2 two times
     asm volatile("\n \
-            v_dot2_f32_f16 %0, %4, %6 %0 \n \
-            v_dot2_f32_f16 %1, %4, %8 %1 \n \
-            v_dot2_f32_f16 %2, %4, %10 %2 \n \
-            v_dot2_f32_f16 %3, %4, %12 %3 \n \
-            v_dot2_f32_f16 %0, %5, %7 %0 \n \
-            v_dot2_f32_f16 %1, %5, %9 %1 \n \
-            v_dot2_f32_f16 %2, %5, %11 %2 \n \
-            v_dot2_f32_f16 %3, %5, %13 %3 \n \
+            v_dot2_f32_f16 %0, %4, %6, %0 \n \
+            v_dot2_f32_f16 %1, %4, %8, %1 \n \
+            v_dot2_f32_f16 %2, %4, %10, %2 \n \
+            v_dot2_f32_f16 %3, %4, %12, %3 \n \
+            v_dot2_f32_f16 %0, %5, %7, %0 \n \
+            v_dot2_f32_f16 %1, %5, %9, %1 \n \
+            v_dot2_f32_f16 %2, %5, %11, %2 \n \
+            v_dot2_f32_f16 %3, %5, %13, %3 \n \
             "
             : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
             : "v"(p_a_half2[0]),
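A usage sketch for the renamed fp16 outer-product helper, following the half4_t overload above. The wrapper function and the pointer-based accumulator layout are invented for the example; only the ck::amd_assembly_outer_product_1x4 signature is taken from the diff.

// Sketch: accumulate one row of a 1x4 outer product into four float accumulators.
__device__ void accumulate_row_1x4(ck::half4_t a,
                                   ck::half4_t b0, ck::half4_t b1,
                                   ck::half4_t b2, ck::half4_t b3,
                                   float* p_c)
{
    // c[j] += dot(a, b[j]) for j = 0..3, implemented with v_dot2_f32_f16 pairs.
    ck::amd_assembly_outer_product_1x4(a, b0, b1, b2, b3,
                                       p_c[0], p_c[1], p_c[2], p_c[3]);
}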
composable_kernel/include/utility/common_header.hpp
@@ -24,4 +24,8 @@
 #include "amd_buffer_addressing.hpp"
 #endif

+#if CK_USE_AMD_XDLOPS
+#include "amd_xdlops.hpp"
+#endif
+
 #endif
composable_kernel/include/utility/config.amd.hpp.in
@@ -31,11 +31,11 @@
 // AMD XDLOPS
 #ifndef CK_USE_AMD_XDLOPS
-#define CK_USE_AMD_XDLOPS 1
+#define CK_USE_AMD_XDLOPS 0
 #endif

 #ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
-#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
 #endif

 // experimental implementation
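Because both macros are wrapped in #ifndef, the new defaults of 0 can still be flipped back on per build without editing this template. The override mechanism below is illustrative only, not something this commit adds:

// Illustrative only: a definition supplied earlier, e.g. on the compiler
// command line (hipcc ... -DCK_USE_AMD_XDLOPS=1 -DCK_USE_AMD_XDLOPS_INLINE_ASM=1),
// takes precedence over the defaults of 0 above, and then enables the
// gate added to common_header.hpp in this same commit:
#if CK_USE_AMD_XDLOPS
#include "amd_xdlops.hpp"
#endif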
composable_kernel/include/utility/float_type.hpp
New file (mode 100644):

#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP

namespace ck {

// For some reason, HIP compiler need this definition to generate optimal ISA
// float
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float32_t __attribute__((ext_vector_type(32)));

// float16
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));

// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));

template <class T, index_t N>
struct vector_type
{
    typedef struct
    {
        T scalar[N];
    } MemoryType;
};

template <>
struct vector_type<float, 1>
{
    using MemoryType = float;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }
};

template <>
struct vector_type<float, 2>
{
    using MemoryType = float2_t;

    union DataType
    {
        MemoryType vector;
        float scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<float, 4>
{
    using MemoryType = float4_t;

    __host__ __device__ static constexpr index_t GetSize() { return 4; }

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }
};

template <>
struct vector_type<half, 1>
{
    using MemoryType = half;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<half*>(&v) + I) = s;
    }
};

template <>
struct vector_type<half, 2>
{
    using MemoryType = half2_t;

    union DataType
    {
        MemoryType vector;
        half scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<half, 4>
{
    using MemoryType = half4_t;

    union DataType
    {
        MemoryType vector;
        half scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};

template <>
struct vector_type<ushort, 1>
{
    using MemoryType = ushort;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }
};

template <>
struct vector_type<ushort, 2>
{
    using MemoryType = ushort2_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<ushort, 4>
{
    using MemoryType = ushort4_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};

// data type conversion
template <typename T>
struct type_convert
{
    template <typename X>
    __device__ T operator()(X x) const
    {
        return static_cast<T>(x);
    }
};

template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
    return bfloat16_to_float(x);
}

template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
    return float_to_bfloat16(x);
}

template <typename T>
struct inner_product_with_conversion
{
    static constexpr auto convert = type_convert<T>();

    __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }

    __device__ T operator()(half2_t a, half2_t b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }

        return acc;
    }

    __device__ T operator()(half4_t a, half4_t b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }

        return acc;
    }

    __device__ T operator()(ushort2_t a, ushort2_t b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }

        return acc;
    }

    __device__ T operator()(ushort4_t a, ushort4_t b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }

        return acc;
    }
};

} // namespace ck
#endif
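A small usage sketch for the new header, in device code. The function name and the concrete values are invented for the example; only vector_type, SetScalar, Pack, Number and inner_product_with_conversion come from the file above.

// Sketch: pack scalars into native vector types and accumulate a packed
// fp16 inner product in float.
__device__ float demo_float_type()
{
    using namespace ck;

    // Pack two scalars into the native 2-wide float vector type.
    float2_t v = vector_type<float, 2>::Pack(1.0f, 2.0f);

    // Overwrite lane 1 through the compile-time indexed setter.
    vector_type<float, 2>::SetScalar(v, 3.0f, Number<1>{});

    // Packed fp16 inner product accumulated in float: 1*3 + 2*4 = 11.
    half2_t a = vector_type<half, 2>::Pack(static_cast<half>(1.0f), static_cast<half>(2.0f));
    half2_t b = vector_type<half, 2>::Pack(static_cast<half>(3.0f), static_cast<half>(4.0f));

    return inner_product_with_conversion<float>{}(a, b) + reinterpret_cast<float*>(&v)[0];
}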
driver/src/driver.cpp
@@ -297,7 +297,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;

@@ -343,7 +343,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;

@@ -450,7 +450,7 @@ int main(int argc, char* argv[])
                                                        ConvStrides{},
                                                        ConvDilations{},
                                                        nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,

@@ -492,7 +492,7 @@ int main(int argc, char* argv[])
                                                        ConvStrides{},
                                                        ConvDilations{},
                                                        nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,