Project: yangql / composable_kernel-1

Commit 184c6e7d, authored Sep 20, 2019 by Chao Liu

Commit message: nvidia build

Parent commit: f00c1381

Showing 8 changed files with 41 additions and 554 deletions (+41 / -554)
composable_kernel/include/tensor_description/tensor_coordinate.hpp                    +2   -2
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp                 +2   -2
composable_kernel/include/tensor_description/tensor_descriptor.hpp                    +1   -1
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +24  -44
composable_kernel/include/utility/amd_inline_asm.hpp                                  +0   -501
composable_kernel/include/utility/common_header.hpp                                   +4   -0
composable_kernel/include/utility/config_amd.hpp.in                                   +4   -2
composable_kernel/include/utility/config_nvidia.hpp.in                                +4   -2
composable_kernel/include/tensor_description/tensor_coordinate.hpp

@@ -325,14 +325,14 @@ struct TensorCoordinate
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp

@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
composable_kernel/include/tensor_description/tensor_descriptor.hpp

@@ -346,7 +346,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 0
+#if 1
     struct lambda_sequence_logic_or
     {
         template <typename... Seqs>
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -21,6 +21,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #endif

+#ifndef CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allow vector access of src and dst.

@@ -835,19 +839,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. src_normal_offset must be calculatd at compile time (guaranteed)
             // 3. src_merged_offset can be runtime value (no assumption imposed)
             static_if<SrcMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                vector_data = *reinterpret_cast<const src_vector_t*>(
-                    &p_src[src_normal_offset + src_merged_offset]);
-#elif 0 // inline asm using global_load
-                vector_data = __global_load<TData, SrcDataPerAccess>(
-                    p_src,
-                    static_cast<uint32_t>(src_merged_offset),
-                    static_cast<uint32_t>(src_normal_offset));
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 vector_data = __buffer_load<TData, SrcDataPerAccess>(
                     p_src,
                     static_cast<uint32_t>(src_merged_offset),
                     static_cast<uint32_t>(src_normal_offset));
+#else
+                vector_data = *reinterpret_cast<const src_vector_t*>(
+                    &p_src[src_normal_offset + src_merged_offset]);
 #endif
             }).Else([&](auto) {
                 // src can be all kinds of memory-space.

@@ -940,15 +940,13 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. dst_normal_offset must be calculatd at compile time (guaranteed)
             // 3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if<DstMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                *reinterpret_cast<dst_vector_t*>(
-                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#elif 0 // inline asm using global_store
-                __global_store<TData, DstDataPerAccess>(
-                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 __buffer_store<TData, DstDataPerAccess>(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#else
+                *reinterpret_cast<dst_vector_t*>(
+                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #endif
             }).Else([&](auto) {
                 // dst can be all kinds of memory-space

@@ -1053,15 +1051,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
         auto src_slice_vectorized =
             mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mSrcSlice: ", typename decltype(mSrcSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("src_slice_vector: ", typename decltype(src_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford<decltype(src_slice_vectorized.GetLengths()), SrcDimAccessOrder>{}(
             [&](auto src_vector_id) {
                 // load vector from src

@@ -1080,7 +1069,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
                         reinterpret_cast<const data_type*>(&vector_data)[i];
                 }
             });
-#endif
         }

         // copy data from buffer into dst

@@ -1093,15 +1081,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
         auto dst_slice_vectorized =
             mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mDstSlice: ", typename decltype(mDstSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("dst_slice_vector: ", typename decltype(dst_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford<decltype(dst_slice_vectorized.GetLengths()), DstDimAccessOrder>{}(
             [&](auto dst_vector_id) {

@@ -1122,7 +1101,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
                 // write vector into dst
                 dst_slice_vectorized(dst_vector_id) = vector_data;
             });
-#endif
         }
     }

@@ -1330,13 +1308,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const index_t buffer_offset = i * src_data_per_access;

                 static_if<SrcMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                         __buffer_load<TData, SrcDataPerAccess>(
                             p_src, static_cast<uint32_t>(src_offset), static_cast<uint32_t>(0));
+#else
+                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
 #endif
                 }).Else([&](auto) {
                     // src can be all kinds of memory-space.

@@ -1358,15 +1337,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                 static_if<DstMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
-                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     __buffer_store<TData, DstDataPerAccess>(
                         *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
                         p_dst,
                         dst_offset,
                         0);
+#else
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
 #endif
                 }).Else([&](auto) {
                     // dst can be all kinds of memory-space
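The hunks above replace hand-toggled `#if 0 / #elif 1` experiment switches with a single guard, `CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1`, so a configuration where either macro is 0 (the NVIDIA build) falls through to the plain pointer copy, while the AMD configuration keeps the __buffer_load/__buffer_store path. Below is a minimal, self-contained sketch of the same guard pattern; load_element() and fake_buffer_load() are hypothetical stand-ins for illustration, not functions from the repository.

    #include <cstdio>

    // Default the switches off, as config_nvidia.hpp.in does after this commit.
    #ifndef CK_USE_AMD_INTRINSIC
    #define CK_USE_AMD_INTRINSIC 0
    #endif
    #ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
    #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
    #endif

    // Stand-in for the real __buffer_load<TData, DataPerAccess>() wrapper; a plain
    // load here so the sketch compiles on any backend.
    static float fake_buffer_load(const float* p, int offset) { return p[offset]; }

    // Same shape as the guarded copy body above: intrinsic path when both switches
    // are on, generic pointer path otherwise.
    static float load_element(const float* p_src, int normal_offset, int merged_offset)
    {
    #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
        return fake_buffer_load(p_src, normal_offset + merged_offset);
    #else
        return p_src[normal_offset + merged_offset];
    #endif
    }

    int main()
    {
        const float p_src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        std::printf("loaded %.1f\n", load_element(p_src, 1, 2)); // prints 4.0
        return 0;
    }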
composable_kernel/include/utility/amd_inline_asm.hpp

@@ -8,507 +8,6 @@ namespace ck {

 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

-__device__ float __llvm_amdgcn_buffer_load(
-    int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load");
-
-__device__ vector_type<float, 2>::MemoryType __llvm_amdgcn_buffer_loadx2(
-    int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
-
-__device__ vector_type<float, 4>::MemoryType __llvm_amdgcn_buffer_loadx4(
-    int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
-
-__device__ void __llvm_amdgcn_buffer_store(
-    float vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store");
-
-__device__ void __llvm_amdgcn_buffer_storex2(
-    vector_type<float, 2>::MemoryType vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc)
-    __asm("llvm.amdgcn.buffer.store.dwordx2");
-
-__device__ void __llvm_amdgcn_buffer_storex4(
-    vector_type<float, 4>::MemoryType vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc)
-    __asm("llvm.amdgcn.buffer.store.dwordx4");
-
-// global_load and global_store
-template <typename T, index_t VectorSize>
-__device__ typename vector_type<T, VectorSize>::MemoryType
-__global_load(const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template <typename T, index_t VectorSize>
-__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __global_load<float, 1>(const float* p_src_block,
-                                         uint32_t src_thread_data_offset,
-                                         uint32_t src_const_data_offset)
-{
-    float dst;
-
-#if 0 // source code
-    dst = p_src_block[src_const_data_offset + src_thread_data_offset];
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type<float, 2>::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast<const vector_t*>(
-        &p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type<float, 4>::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast<const vector_t*>(
-        &p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ void __global_store<float, 1>(const float& src,
-                                         float* p_dst_block,
-                                         uint32_t dst_thread_data_offset,
-                                         uint32_t dst_const_data_offset)
-{
-#if 0 // compute on VALU
-    uint64_t dst_thread_data_offset_u64 =
-        (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
-#else
-    // compute on SALU
-    uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
-
-    float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
-#endif
-}
-
-// buffer_load and buffer_store
-template <typename T, index_t VectorSize>
-__device__ typename vector_type<T, VectorSize>::MemoryType
-__buffer_load(const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template <typename T, index_t VectorSize>
-__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __buffer_load<float, 1>(const float* p_src_block,
-                                         uint32_t src_thread_data_offset,
-                                         uint32_t src_const_data_offset)
-{
-#if 0
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_load(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type<float, 2>::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    vector_type<float, 2>::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx2(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type<float, 4>::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#elif 1
-    vector_type<float, 4>::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx4(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ void __buffer_store<float, 1>(const float& src,
-                                         float* p_dst_block,
-                                         uint32_t dst_thread_data_offset,
-                                         uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_store(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
-                                         float* p_dst_block,
-                                         uint32_t dst_thread_data_offset,
-                                         uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex2(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
-                                         float* p_dst_block,
-                                         uint32_t dst_thread_data_offset,
-                                         uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex4(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
 __device__ void vmcnt(index_t cnt)
 {
     if(cnt == 0)
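The deleted __buffer_load/__buffer_store specializations all build the same four-dword buffer resource descriptor (src_block_setting / dst_block_setting): dwords 0-1 take the buffer base pointer, dword 2 is set to -1, and dword 3 to the constant 0x00027000, before the descriptor is handed to the llvm.amdgcn.buffer.* intrinsics. The host-side sketch below only reproduces that packing so the layout is easy to inspect; the field interpretations in the comments are my reading of the GCN buffer-descriptor format, not something stated in the diff.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        // Dummy base pointer standing in for p_src_block; only the bit pattern matters here.
        const float* p_src_block = reinterpret_cast<const float*>(static_cast<uintptr_t>(0x1000));

        int32_t desc[4] = {0, 0, 0, 0}; // plays the role of the int32x4_t "src_block_setting"

        // dwords 0-1: buffer base address ("fill in byte 0 - 1" in the deleted code)
        std::memcpy(&desc[0], &p_src_block, sizeof(p_src_block));
        // dword 2: num_records; all-ones is effectively "no size limit" (assumption)
        desc[2] = -1;
        // dword 3: format/stride configuration constant used throughout CK (assumption)
        desc[3] = 0x00027000;

        std::printf("descriptor = {0x%08x, 0x%08x, 0x%08x, 0x%08x}\n",
                    static_cast<uint32_t>(desc[0]),
                    static_cast<uint32_t>(desc[1]),
                    static_cast<uint32_t>(desc[2]),
                    static_cast<uint32_t>(desc[3]));
        return 0;
    }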
composable_kernel/include/utility/common_header.hpp

@@ -22,4 +22,8 @@
 #include "amd_inline_asm.hpp"
 #endif

+#if CK_USE_AMD_INTRINCIS
+#include "amd_intrinsic.hpp"
+#endif
+
 #endif
composable_kernel/include/utility/config_amd.hpp.in

@@ -4,9 +4,11 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"

-#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0

@@ -14,7 +16,7 @@
 namespace ck {

-#if CK_UNSIGNED_INDEX_TYPE
+#if CK_USE_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -6,9 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"

-#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_NVIDIA 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0

@@ -16,7 +18,7 @@
 namespace ck {

-#if CK_UNSIGNED_INDEX_TYPE
+#if CK_USE_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
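Both config templates now spell the switch CK_USE_UNSIGNED_INDEX_TYPE and gate index_t on it, with signed int32_t remaining the default on both backends. A small standalone check of that selection logic, assuming the macro is left at 0 as in both files above:

    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    // Mirror of the #if block shared by config_amd.hpp.in and config_nvidia.hpp.in.
    #ifndef CK_USE_UNSIGNED_INDEX_TYPE
    #define CK_USE_UNSIGNED_INDEX_TYPE 0
    #endif

    #if CK_USE_UNSIGNED_INDEX_TYPE
    using index_t = uint32_t;
    #else
    using index_t = int32_t;
    #endif

    int main()
    {
        // With the default of 0, index_t is a signed 32-bit type, so expressions
        // such as "i - 1" or backward loops behave as ordinary signed arithmetic.
        std::printf("index_t: %s, %zu bytes\n",
                    std::is_signed<index_t>::value ? "signed" : "unsigned",
                    sizeof(index_t));
        return 0;
    }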