Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c03945f0
Commit
c03945f0
authored
Apr 09, 2021
by
Chao Liu
Browse files
revert changes to buffer addressing that is irrelavent to magic number division
parent
0dc0fa5c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
11 additions
and
83 deletions
+11
-83
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+0
-11
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...sable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+8
-68
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+3
-3
composable_kernel/include/utility/float_type.amd.hpp.in
composable_kernel/include/utility/float_type.amd.hpp.in
+0
-1
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
c03945f0
...
@@ -6,17 +6,6 @@
...
@@ -6,17 +6,6 @@
namespace
ck
{
namespace
ck
{
template
<
typename
T
>
union
BufferResource
{
// 128 bit SGPRs to supply buffer resource in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t
data
;
T
*
address
[
2
];
int32_t
range
[
4
];
int32_t
config
[
4
];
};
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
vindex
,
index_t
offset
,
index_t
offset
,
...
...
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
View file @
c03945f0
...
@@ -6,27 +6,27 @@
...
@@ -6,27 +6,27 @@
namespace
ck
{
namespace
ck
{
template
<
typename
T
>
template
<
typename
T
>
union
BufferResource
_v2
union
BufferResource
{
{
// 128 bit SGPRs to supply buffer resource in buffer instructions
// 128 bit SGPRs to supply buffer resource in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t
data
;
int32x4_t
data
;
StaticallyIndexedArray
<
T
*
,
2
>
address
;
T
*
address
[
2
]
;
StaticallyIndexedArray
<
int32_t
,
4
>
range
;
int32_t
range
[
4
]
;
StaticallyIndexedArray
<
int32_t
,
4
>
config
;
int32_t
config
[
4
]
;
};
};
template
<
typename
T
>
template
<
typename
T
>
__device__
int32x4_t
make_wave_buffer_resource
(
T
*
p_wave
,
index_t
data_space_size
)
__device__
int32x4_t
make_wave_buffer_resource
(
T
*
p_wave
,
index_t
data_space_size
)
{
{
BufferResource
_v2
<
T
>
wave_buffer_resource
;
BufferResource
<
T
>
wave_buffer_resource
;
// wavewise base address (64 bit)
// wavewise base address (64 bit)
wave_buffer_resource
.
address
(
Number
<
0
>
{})
=
const_cast
<
remove_cv_t
<
T
>*>
(
p_wave
);
wave_buffer_resource
.
address
[
0
]
=
const_cast
<
remove_cv_t
<
T
>*>
(
p_wave
);
// wavewise range (32 bit)
// wavewise range (32 bit)
wave_buffer_resource
.
range
(
Number
<
2
>
{})
=
data_space_size
*
sizeof
(
T
);
wave_buffer_resource
.
range
[
2
]
=
data_space_size
*
sizeof
(
T
);
// wavewise setting (32 bit)
// wavewise setting (32 bit)
wave_buffer_resource
.
config
(
Number
<
3
>
{})
=
CK_BUFFER_RESOURCE_3RD_DWORD
;
wave_buffer_resource
.
config
[
3
]
=
CK_BUFFER_RESOURCE_3RD_DWORD
;
return
wave_buffer_resource
.
data
;
return
wave_buffer_resource
.
data
;
}
}
...
@@ -37,19 +37,6 @@ __llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
...
@@ -37,19 +37,6 @@ __llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
index_t
voffset
,
index_t
voffset
,
index_t
soffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.i8"
);
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.i8"
);
__device__
int8x2_t
__llvm_amdgcn_raw_buffer_load_i8x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2i8"
);
__device__
int8x4_t
__llvm_amdgcn_raw_buffer_load_i8x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4i8"
);
__device__
int16_t
__device__
int16_t
__llvm_amdgcn_raw_buffer_load_i16
(
int32x4_t
srsrc
,
__llvm_amdgcn_raw_buffer_load_i16
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
voffset
,
...
@@ -118,20 +105,6 @@ __llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
...
@@ -118,20 +105,6 @@ __llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
index_t
soffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i8"
);
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i8"
);
__device__
void
__llvm_amdgcn_raw_buffer_store_i8x2
(
int8x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2i8"
);
__device__
void
__llvm_amdgcn_raw_buffer_store_i8x4
(
int8x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4i8"
);
__device__
void
__device__
void
__llvm_amdgcn_raw_buffer_store_i16
(
int16_t
vdata
,
__llvm_amdgcn_raw_buffer_store_i16
(
int16_t
vdata
,
int32x4_t
rsrc
,
int32x4_t
rsrc
,
...
@@ -210,7 +183,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
...
@@ -210,7 +183,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
index_t
src_wave_addr_offset
)
index_t
src_wave_addr_offset
)
{
{
static_assert
((
is_same
<
T
,
float
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
static_assert
((
is_same
<
T
,
float
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
(
is_same
<
T
,
int8_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
(
is_same
<
T
,
half_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
))
||
(
is_same
<
T
,
half_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
))
||
(
is_same
<
T
,
half2_t
>::
value
&&
(
N
==
1
))
||
(
is_same
<
T
,
half2_t
>::
value
&&
(
N
==
1
))
||
(
is_same
<
T
,
half4_t
>::
value
&&
(
N
==
1
))
||
(
is_same
<
T
,
half4_t
>::
value
&&
(
N
==
1
))
||
...
@@ -334,38 +306,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
...
@@ -334,38 +306,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
0
);
0
);
return
tmp
.
Vector
();
}
}
else
if
constexpr
(
is_same
<
T
,
int8_t
>::
value
)
{
if
constexpr
(
N
==
1
)
{
return
__llvm_amdgcn_raw_buffer_load_i8
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
0
);
}
else
if
constexpr
(
N
==
2
)
{
return
__llvm_amdgcn_raw_buffer_load_i8x2
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
0
);
}
else
if
constexpr
(
N
==
4
)
{
return
__llvm_amdgcn_raw_buffer_load_i8x4
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
0
);
}
else
if
constexpr
(
N
==
8
)
{
vector_type
<
int8_t
,
8
>
tmp
;
tmp
.
Vectors
(
Number
<
4
>
{})(
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_i8x4
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
0
);
tmp
.
Vectors
(
Number
<
4
>
{})(
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_i8x4
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int8_t
),
0
);
return
tmp
.
Vector
();
return
tmp
.
Vector
();
}
}
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
c03945f0
...
@@ -14,11 +14,11 @@
...
@@ -14,11 +14,11 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
// GPU ID
#if
1
#if
0
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX906 1
#elif 0
#elif 0
#define CK_AMD_GPU_GFX908 1
#define CK_AMD_GPU_GFX908 1
#elif
0
#elif
1
#define CK_AMD_GPU_GFX1030 1
#define CK_AMD_GPU_GFX1030 1
#endif
#endif
...
@@ -88,7 +88,7 @@
...
@@ -88,7 +88,7 @@
// experimental implementation
// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
0
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
1
#endif
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
View file @
c03945f0
...
@@ -437,7 +437,6 @@ struct vector_type<int8_t, 16>
...
@@ -437,7 +437,6 @@ struct vector_type<int8_t, 16>
// i8
// i8
// hack for int8x4_t, because compiler does not have native support for int8x4_t
// hack for int8x4_t, because compiler does not have native support for int8x4_t
// int8x4_t is defined as int32_t
// int8x4_t is defined as int32_t
using int8x2_t = typename vector_type<int8_t, 2>::type;
using int8x4_t = typename vector_type<int8_t, 4>::type;
using int8x4_t = typename vector_type<int8_t, 4>::type;
using int8x8_t = typename vector_type<int8_t, 8>::type;
using int8x8_t = typename vector_type<int8_t, 8>::type;
using int8x16_t = typename vector_type<int8_t, 16>::type;
using int8x16_t = typename vector_type<int8_t, 16>::type;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment