Commit f5654649 authored by Chao Liu's avatar Chao Liu
Browse files

added back amd_assembly_outer_product_1x2 and amd_assembly_outer_product_1x4

parent 9d5d6afa
......@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto M = CDesc{}.GetLength(I0);
constexpr auto N = CDesc{}.GetLength(I1);
......@@ -223,6 +225,46 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) {
constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
if constexpr(N == 2)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
amd_assembly_outer_product_1x2(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}));
}
else if constexpr(N == 4)
{
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, N, 1>{}([&](auto n) {
constexpr index_t a_offset =
......@@ -241,6 +283,7 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#endif
});
}
});
});
}
......
......@@ -61,72 +61,56 @@ struct ThreadwiseGemm_km_kn_mn_v3
static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) {
#if 0
constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
if constexpr(H == 2 && W == 2)
{
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[Number<b_offset_0>{}],
p_b[Number<b_offset_1>{}],
p_b[Number<b_offset_2>{}],
p_b[Number<b_offset_3>{}],
p_c[Number<c_offset_0>{}],
p_c[Number<c_offset_1>{}],
p_c[Number<c_offset_2>{}],
p_c[Number<c_offset_3>{}]);
}
else if constexpr(H == 4 && W == 1)
{
constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
p_b[Number<b_offset_0>{}],
p_b[Number<b_offset_1>{}],
p_b[Number<b_offset_2>{}],
p_b[Number<b_offset_3>{}],
p_c[Number<c_offset_0>{}],
p_c[Number<c_offset_1>{}],
p_c[Number<c_offset_2>{}],
p_c[Number<c_offset_3>{}]);
}
else
{
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr auto b_offset =
BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
constexpr auto c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
p_c[c_offset] += inner_product_with_conversion<FloatC>{}(p_a[a_offset],
p_b[b_offset]);
});
});
}
#else
constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
......@@ -134,12 +118,17 @@ struct ThreadwiseGemm_km_kn_mn_v3
constexpr index_t c_offset =
CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
#if 0
p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}]);
#else
amd_assembly_inner_product(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}],
p_c[Number<c_offset>{}]);
#endif
});
});
#endif
}
});
});
}
......
......@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
c);
}
#if 0
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
......@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
c2,
c3);
}
#endif
} // namespace ck
#endif
......@@ -28,11 +28,11 @@
#endif
// launch bounds
#define CK_USE_LAUNCH_BOUNDS 0
#define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 2
#define CK_MIN_BLOCK_PER_CU 1
#endif
// buffer resourse
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment