Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
ec17a109
Commit
ec17a109
authored
Jul 29, 2020
by
Chao Liu
Browse files
use uint32_t for addr shift in buffer addressing
parent
435f5f91
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
66 additions
and
53 deletions
+66
-53
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+66
-53
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
ec17a109
...
@@ -196,6 +196,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
...
@@ -196,6 +196,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
0
,
...
@@ -204,10 +205,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
...
@@ -204,10 +205,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32
(
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#else
return
src_thread_data_valid
?
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
)
:
0
;
#endif
#endif
}
}
...
@@ -236,10 +243,10 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
...
@@ -236,10 +243,10 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x2
(
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
}
}
...
@@ -268,10 +275,10 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
...
@@ -268,10 +275,10 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x4
(
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
}
}
...
@@ -301,10 +308,10 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
...
@@ -301,10 +308,10 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f16
(
return
__llvm_amdgcn_buffer_load_f16
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
#else
#else
return
src_thread_data_valid
?
p_src_wave
[
src_thread_data_offset
]
:
0
;
return
src_thread_data_valid
?
p_src_wave
[
src_thread_data_offset
]
:
0
;
...
@@ -336,10 +343,10 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
...
@@ -336,10 +343,10 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
half2_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
half2_t
*>
(
&
dst_out_tmp
);
...
@@ -370,10 +377,10 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
...
@@ -370,10 +377,10 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
half4_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
half4_t
*>
(
&
dst_out_tmp
);
...
@@ -404,10 +411,10 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
...
@@ -404,10 +411,10 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
half8_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
half8_t
*>
(
&
dst_out_tmp
);
...
@@ -439,10 +446,10 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
...
@@ -439,10 +446,10 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_bf16
(
return
__llvm_amdgcn_buffer_load_bf16
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
#else
#else
...
@@ -475,10 +482,10 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
...
@@ -475,10 +482,10 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
ushort2_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
ushort2_t
*>
(
&
dst_out_tmp
);
...
@@ -509,10 +516,10 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
...
@@ -509,10 +516,10 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
ushort4_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
ushort4_t
*>
(
&
dst_out_tmp
);
...
@@ -543,10 +550,10 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
...
@@ -543,10 +550,10 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
src_addr_
base
=
src_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
src_addr_
shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_
base
+
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_addr_
shift
+
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
return
*
reinterpret_cast
<
ushort8_t
*>
(
&
dst_out_tmp
);
return
*
reinterpret_cast
<
ushort8_t
*>
(
&
dst_out_tmp
);
...
@@ -570,6 +577,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
...
@@ -570,6 +577,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
...
@@ -578,15 +586,22 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
...
@@ -578,15 +586,22 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
}
template
<
>
template
<
>
...
@@ -615,12 +630,12 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
...
@@ -615,12 +630,12 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -652,12 +667,12 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
...
@@ -652,12 +667,12 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -690,12 +705,12 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
...
@@ -690,12 +705,12 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f16
(
*
p_src_thread
,
__llvm_amdgcn_buffer_store_f16
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -736,12 +751,12 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
...
@@ -736,12 +751,12 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -771,18 +786,16 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
...
@@ -771,18 +786,16 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_thread_data_valid
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
?
dst_thread_addr_offset
,
:
0xffffffff
,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -816,12 +829,12 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
...
@@ -816,12 +829,12 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -854,12 +867,12 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
...
@@ -854,12 +867,12 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_bf16
(
*
p_src_thread
,
__llvm_amdgcn_buffer_store_bf16
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -900,12 +913,12 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
...
@@ -900,12 +913,12 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -939,12 +952,12 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
...
@@ -939,12 +952,12 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -978,12 +991,12 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
...
@@ -978,12 +991,12 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
false
,
false
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
,
false
,
false
);
false
);
#endif
#endif
...
@@ -1014,12 +1027,12 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
...
@@ -1014,12 +1027,12 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
);
false
);
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
,
dst_addr_
shift
+
dst_thread_addr_offset
,
false
);
false
);
#endif
#endif
}
}
...
@@ -1053,14 +1066,14 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
...
@@ -1053,14 +1066,14 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
false
);
false
);
}
}
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
+
dst_addr_
shift
+
dst_thread_addr_offset
+
i
*
sizeof
(
float
),
i
*
sizeof
(
float
),
false
);
false
);
}
}
...
@@ -1096,14 +1109,14 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
...
@@ -1096,14 +1109,14 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
false
);
false
);
}
}
#else
#else
in
dex
_t
dst_addr_
base
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
u
in
t32
_t
dst_addr_
shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
{
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_buffer_resource
.
data
,
dst_wave_buffer_resource
.
data
,
0
,
0
,
dst_addr_
base
+
dst_thread_addr_offset
+
dst_addr_
shift
+
dst_thread_addr_offset
+
i
*
sizeof
(
float
),
i
*
sizeof
(
float
),
false
);
false
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment