"megatron/git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "bf97f933db4b8b8de1582bb29a0065ec2e80932f"
Commit 64877b2c authored by aska-0096's avatar aska-0096
Browse files

sanity pass

parent 7761e523
...@@ -692,8 +692,16 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle ...@@ -692,8 +692,16 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
constexpr auto WmmaK = 16; constexpr auto WmmaK = 16;
constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); constexpr auto KPack = math::integer_least_multiple(K1, WmmaK);
// In inline asm mode, you can choose either
// 1. BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO
// This one generate clear assembly code, but performance is low.
// Or
// 2. BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
// This one have redundant "ds_load" instruction because compiler have limitation in
// optimize code with inline assembly. Though the generated code has more line of code,
// performance is high.
auto blockwise_gemm = auto blockwise_gemm =
BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle<BlockSize, BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO<BlockSize,
ADataType, ADataType,
BDataType, BDataType,
AccDataType, AccDataType,
......
...@@ -358,7 +358,7 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, ...@@ -358,7 +358,7 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
// Ranged input operand // Ranged input operand
__device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, half16_t b, float8_t& c) __device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, half16_t b, float8_t& c)
{ {
#if defined(__gfx11__) #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" : "=v"(c) : "v"(a), "v"(b), "0"(c)); asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" : "=v"(c) : "v"(a), "v"(b), "0"(c));
#else #else
ignore = a; ignore = a;
......
...@@ -23,11 +23,11 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> ...@@ -23,11 +23,11 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16>
{ {
// * Inline assembly need to elimate the duplicated data load, compiler won't help you // * Inline assembly need to elimate the duplicated data load, compiler won't help you
// delete them. // delete them.
// amd_assembly_wmma_f32_16x16x16_f16_w32( amd_assembly_wmma_f32_16x16x16_f16_w32(
// reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{})); reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
reg_c.template AsType<float8_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( // reg_c.template AsType<float8_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]); // reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
#else #else
ignore = reg_a; ignore = reg_a;
ignore = reg_b; ignore = reg_b;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment