Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
3a44a469
"...composable_kernel-1.git" did not exist on "b3e8d57d51300b88b591900621f71b6a1b3a7acc"
Commit
3a44a469
authored
Oct 15, 2020
by
Chao Liu
Browse files
fix bug for miopen
parent
9d000309
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
163 additions
and
9 deletions
+163
-9
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+6
-6
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+1
-2
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+8
-0
composable_kernel/include/utility/float_type.amd.hpp.in
composable_kernel/include/utility/float_type.amd.hpp.in
+147
-0
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
...ble_kernel/include/utility/in_memory_operation.amd.hpp.in
+1
-1
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
3a44a469
...
@@ -246,14 +246,14 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
...
@@ -246,14 +246,14 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
// everything is passed to Voffset
return
__llvm_amdgcn_raw_buffer_load_f16
(
return
__llvm_amdgcn_raw_buffer_load_f16
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_
data
_offset
,
0
,
0
);
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_
addr
_offset
,
0
,
0
);
#else
#else
half_t
zero
(
0
);
half_t
zero
(
0
);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
// everything is passed to Voffset
return
src_thread_data_valid
?
__llvm_amdgcn_raw_buffer_load_f16
(
return
src_thread_data_valid
?
__llvm_amdgcn_raw_buffer_load_f16
(
src_wave_buffer_resource
.
data
,
src_thread_
data
_offset
,
0
,
0
)
src_wave_buffer_resource
.
data
,
src_thread_
addr
_offset
,
0
,
0
)
:
zero
;
:
zero
;
#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
}
}
...
@@ -356,7 +356,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
...
@@ -356,7 +356,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_
offset
?
*
reinterpret_cast
<
half8_t
*>
(
&
dst_out_tmp
)
:
zeros
;
return
src_thread_data_
valid
?
*
reinterpret_cast
<
half8_t
*>
(
&
dst_out_tmp
)
:
zeros
;
#endif
#endif
}
}
...
@@ -385,7 +385,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
...
@@ -385,7 +385,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
return
__llvm_amdgcn_raw_buffer_load_bf16
(
return
__llvm_amdgcn_raw_buffer_load_bf16
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
#else
#else
ushort
_t
zero
(
0
);
ushort
zero
(
0
);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
// everything is passed to Voffset
...
@@ -493,7 +493,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
...
@@ -493,7 +493,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_
offset
?
*
reinterpret_cast
<
ushort8_t
*>
(
&
dst_out_tmp
)
:
zeros
;
return
src_thread_data_
valid
?
*
reinterpret_cast
<
ushort8_t
*>
(
&
dst_out_tmp
)
:
zeros
;
#endif
#endif
}
}
...
@@ -969,9 +969,9 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
...
@@ -969,9 +969,9 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
...
...
composable_kernel/include/utility/common_header.hpp
View file @
3a44a469
#ifndef CK_COMMON_HEADER_HPP
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#include "config.hpp"
#include "array.hpp"
#include "array.hpp"
#include "container_helper.hpp"
#include "container_helper.hpp"
#include "statically_indexed_array.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "container_element_picker.hpp"
#include "config.hpp"
#include "float_type.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional2.hpp"
...
@@ -25,7 +25,6 @@
...
@@ -25,7 +25,6 @@
#if CK_USE_AMD_INLINE_ASM
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#include "amd_inline_asm.hpp"
#include "amd_llvm_intrinsic.hpp"
#endif
#endif
#if CK_USE_AMD_XDLOPS
#if CK_USE_AMD_XDLOPS
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
3a44a469
#ifndef CK_CONFIG_AMD_HPP
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"
#include "bfloat16_dev.hpp"
#ifndef CK_HIP_VERSION_FLAT
#ifndef CK_HIP_VERSION_FLAT
...
@@ -78,6 +80,12 @@
...
@@ -78,6 +80,12 @@
#define CK_WORKAROUND_SWDEV_229564 1
#define CK_WORKAROUND_SWDEV_229564 1
#endif
#endif
// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif
namespace ck {
namespace ck {
enum AddressSpace
enum AddressSpace
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
View file @
3a44a469
...
@@ -21,6 +21,153 @@ typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
...
@@ -21,6 +21,153 @@ typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
struct c_vec32_4_t
{
union VecType
{
struct
{
float32_t x;
float32_t y;
float32_t z;
float32_t w;
} s;
float n[128];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
c.s.y = 0;
c.s.z = 0;
c.s.w = 0;
return c;
}
};
struct c_vec32_2_t
{
union VecType
{
struct
{
float32_t x;
float32_t y;
} s;
float n[64];
} l;
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
c.s.y = 0;
return c;
}
};
struct c_vec32_2_2_t
{
union VecType
{
struct
{
c_vec32_2_t x;
c_vec32_2_t y;
} s;
float n[128];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x.l.s.x = 0;
c.s.x.l.s.y = 0;
c.s.y.l.s.x = 0;
c.s.y.l.s.y = 0;
return c;
}
};
struct c_vec32_1_t
{
union VecType
{
struct
{
float32_t x;
} s;
float n[32];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
return c;
}
};
struct c_vec16_1_t
{
union VecType
{
struct
{
float16_t x;
} s;
float n[16];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
return c;
}
};
struct c_vec4_2_t
{
union VecType
{
struct
{
float4_t x;
float4_t y;
} s;
float n[8];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
c.s.y = 0;
return c;
}
};
struct c_vec4_1_t
{
union VecType
{
struct
{
float4_t x;
} s;
float n[4];
};
__host__ __device__ static VecType CreateVecZero()
{
VecType c;
c.s.x = 0;
return c;
}
};
template <class T, index_t N>
template <class T, index_t N>
struct vector_type
struct vector_type
{
{
...
...
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
3a44a469
...
@@ -141,7 +141,7 @@ struct AtomicAddData
...
@@ -141,7 +141,7 @@ struct AtomicAddData
}
}
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_
F
ADD
// buffer_atomic requires:
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
// 2) p_dst_thread to be a wavewise pointer.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment