Commit d8cf5e5a authored by Chao Liu
Browse files

fix

parent ec17a109
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
namespace ck { namespace ck {
#if MIOPEN_USE_FP32
// outer-product: c[i,j] += inner_product(a[i], b[j]) // outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
{ {
...@@ -29,7 +30,9 @@ __device__ void amd_assembly_outer_product_1x4( ...@@ -29,7 +30,9 @@ __device__ void amd_assembly_outer_product_1x4(
: "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
: "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
} }
#endif
#if MIOPEN_USE_FP16
// outer-product: c[i,j] += inner_product(a[i], b[j]) // outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void __device__ void
amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
...@@ -145,6 +148,7 @@ __device__ void amd_assembly_outer_product_1x4(half4_t a, ...@@ -145,6 +148,7 @@ __device__ void amd_assembly_outer_product_1x4(half4_t a,
"2"(c2), "2"(c2),
"3"(c3)); // 3rd Src Acc registers for 2 half2 registers "3"(c3)); // 3rd Src Acc registers for 2 half2 registers
} }
#endif
} // namespace ck } // namespace ck
#endif #endif
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#if CK_USE_AMD_XDLOPS #if CK_USE_AMD_XDLOPS
#include "amd_xdlops.hpp" #include "amd_xdlops.hpp"
#include "amd_xdlops_inline_asm.hpp"
#endif #endif
#endif #endif
...@@ -145,8 +145,9 @@ struct AtomicAddData ...@@ -145,8 +145,9 @@ struct AtomicAddData
template <> template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid,
index_t /* src_range */, index_t /* src_range */,
bool src_valid T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid, bool dst_valid,
index_t dst_range) const index_t dst_range) const
...@@ -157,7 +158,7 @@ struct AtomicAddData ...@@ -157,7 +158,7 @@ struct AtomicAddData
p_dst, p_dst,
dst_offset, dst_offset,
dst_valid, dst_valid,
index_t dst_range); dst_range);
} }
#endif #endif
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment