Commit f5654649 authored by Chao Liu
Browse files

added back amd_assembly_outer_product_1x2 and amd_assembly_outer_product_1x4

parent 9d5d6afa
...@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1 ...@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto M = CDesc{}.GetLength(I0); constexpr auto M = CDesc{}.GetLength(I0);
constexpr auto N = CDesc{}.GetLength(I1); constexpr auto N = CDesc{}.GetLength(I1);
...@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1 ...@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) { static_for<0, M, 1>{}([&](auto m) {
static_for<0, N, 1>{}([&](auto n) { constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
if constexpr(N == 2)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t a_offset = constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(k, m)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(k, n)); amd_assembly_outer_product_1x2(a_buf[Number<a_offset>{}],
constexpr index_t c_offset = b_buf[Number<b_offset_0>{}],
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(m, n)); b_buf[Number<b_offset_1>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}));
}
else if constexpr(N == 4)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, N, 1>{}([&](auto n) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(k, m));
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(k, n));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(m, n));
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM #if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
amd_assembly_inner_product(a_buf[Number<a_offset>{}], amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}], b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{})); c_buf(Number<c_offset>{}));
#else #else
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}( c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]); a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#endif #endif
}); });
}
}); });
}); });
} }
......
...@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3
static_for<0, E, 1>{}([&](auto e) { static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
#if 0 constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
if constexpr(H == 2 && W == 2) if constexpr(H == 2 && W == 2)
{ {
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0)); constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1)); constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0)); constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1)); constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0)); constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0)); constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1)); constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(p_a[a_offset], amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[b_offset_0], p_b[Number<b_offset_0>{}],
p_b[b_offset_1], p_b[Number<b_offset_1>{}],
p_b[b_offset_2], p_b[Number<b_offset_2>{}],
p_b[b_offset_3], p_b[Number<b_offset_3>{}],
p_c[c_offset_0], p_c[Number<c_offset_0>{}],
p_c[c_offset_1], p_c[Number<c_offset_1>{}],
p_c[c_offset_2], p_c[Number<c_offset_2>{}],
p_c[c_offset_3]); p_c[Number<c_offset_3>{}]);
} }
else if constexpr(H == 4 && W == 1) else if constexpr(H == 4 && W == 1)
{ {
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0)); constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0)); constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0)); constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0)); constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0)); constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0)); constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0)); constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0)); constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(p_a[a_offset], amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[b_offset_0], p_b[Number<b_offset_0>{}],
p_b[b_offset_1], p_b[Number<b_offset_1>{}],
p_b[b_offset_2], p_b[Number<b_offset_2>{}],
p_b[b_offset_3], p_b[Number<b_offset_3>{}],
p_c[c_offset_0], p_c[Number<c_offset_0>{}],
p_c[c_offset_1], p_c[Number<c_offset_1>{}],
p_c[c_offset_2], p_c[Number<c_offset_2>{}],
p_c[c_offset_3]); p_c[Number<c_offset_3>{}]);
} }
else else
{ {
static_for<0, H, 1>{}([&](auto h) { static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) { static_for<0, W, 1>{}([&](auto w) {
constexpr auto b_offset = constexpr index_t b_offset =
BDesc{}.CalculateOffset(make_tuple(e, 0, h, w)); BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
constexpr auto c_offset = constexpr index_t c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w)); CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
p_c[c_offset] += inner_product_with_conversion<FloatC>{}(p_a[a_offset], #if 0
p_b[b_offset]); p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}]);
#else
amd_assembly_inner_product(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}],
p_c[Number<c_offset>{}]);
#endif
}); });
}); });
} }
#else
constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
amd_assembly_inner_product(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}],
p_c[Number<c_offset>{}]);
});
});
#endif
}); });
}); });
} }
......
...@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& ...@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
c); c);
} }
#if 0
// c0 += inner_product(a, b0) // c0 += inner_product(a, b0)
// c1 += inner_product(a, b1) // c1 += inner_product(a, b1)
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
...@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, ...@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
c2, c2,
c3); c3);
} }
#endif
} // namespace ck } // namespace ck
#endif #endif
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
#endif #endif
// launch bounds // launch bounds
#define CK_USE_LAUNCH_BOUNDS 0 #define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS #ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256 #define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 2 #define CK_MIN_BLOCK_PER_CU 1
#endif #endif
// buffer resourse // buffer resourse
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment