Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f2ac7832
Commit
f2ac7832
authored
Aug 11, 2021
by
Chao Liu
Browse files
make innner product compatiable on gfx900
parent
4e57b30a
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
234 additions
and
215 deletions
+234
-215
composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp
...include/tensor_operation/threadwise_contraction_dlops.hpp
+4
-5
composable_kernel/include/utility/amd_dlop.hpp
composable_kernel/include/utility/amd_dlop.hpp
+0
-188
composable_kernel/include/utility/amd_inline_asm.hpp
composable_kernel/include/utility/amd_inline_asm.hpp
+2
-0
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+2
-4
composable_kernel/include/utility/config.hpp
composable_kernel/include/utility/config.hpp
+14
-13
composable_kernel/include/utility/inner_product.hpp
composable_kernel/include/utility/inner_product.hpp
+207
-0
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+1
-1
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+4
-4
No files found.
composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp
View file @
f2ac7832
...
@@ -97,10 +97,9 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
...
@@ -97,10 +97,9 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
CThreadDesc_TM0_TM1_TN0_TN1
{}.
CalculateOffset
(
CThreadDesc_TM0_TM1_TN0_TN1
{}.
CalculateOffset
(
c_origin_idx
+
make_multi_index
(
tm0
,
tm1
,
tn0
,
tn1
));
c_origin_idx
+
make_multi_index
(
tm0
,
tm1
,
tn0
,
tn1
));
amd_inner_product_dlop
<
FloatA
,
FloatB
,
FloatC
>
(
inner_product
<
FloatA
,
FloatB
,
FloatC
>
(
a_buf
[
Number
<
a_offset
>
{}],
a_buf
[
Number
<
a_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}],
c_buf
(
Number
<
c_offset
>
{}));
c_buf
(
Number
<
c_offset
>
{}));
});
});
});
});
});
});
...
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
...
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
CThreadDesc_TM0_TM1_TN0_TN1
{}.
CalculateOffset
(
CThreadDesc_TM0_TM1_TN0_TN1
{}.
CalculateOffset
(
c_origin_idx
+
make_multi_index
(
tm0
,
tm1
,
tn0
,
tn1
));
c_origin_idx
+
make_multi_index
(
tm0
,
tm1
,
tn0
,
tn1
));
amd_
inner_product
_dlop
<
a_vector_t
,
b_vector_t
,
FloatC
>
(
inner_product
<
a_vector_t
,
b_vector_t
,
FloatC
>
(
a_vec
.
template
AsType
<
a_vector_t
>()[
I0
],
a_vec
.
template
AsType
<
a_vector_t
>()[
I0
],
b_vec
.
template
AsType
<
b_vector_t
>()[
I0
],
b_vec
.
template
AsType
<
b_vector_t
>()[
I0
],
c_buf
(
Number
<
c_offset
>
{}));
c_buf
(
Number
<
c_offset
>
{}));
...
...
composable_kernel/include/utility/amd_dlop.hpp
deleted
100644 → 0
View file @
4e57b30a
#ifndef CK_AMD_DLOP_HPP
#define CK_AMD_DLOP_HPP
#include "data_type.hpp"
namespace
ck
{
template
<
typename
TA
,
typename
TB
,
typename
TC
>
__device__
void
amd_inner_product_dlop
(
const
TA
&
a
,
const
TB
&
b
,
TC
&
c
);
template
<
>
__device__
void
amd_inner_product_dlop
<
float
,
float
,
float
>
(
const
float
&
a
,
const
float
&
b
,
float
&
c
)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm
volatile
(
"
\n
\
v_fmac_f32 %0, %1, %2
\n
\
"
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#else
c
+=
a
*
b
;
#endif
}
template
<
>
__device__
void
amd_inner_product_dlop
<
float2_t
,
float2_t
,
float
>
(
const
float2_t
&
a
,
const
float2_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
amd_inner_product_dlop
(
vector_type
<
float
,
2
>
{
a
}.
AsType
<
float
>
()[
I0
],
vector_type
<
float
,
2
>
{
b
}.
AsType
<
float
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
float
,
2
>
{
a
}.
AsType
<
float
>
()[
I1
],
vector_type
<
float
,
2
>
{
b
}.
AsType
<
float
>
()[
I1
],
c
);
}
template
<
>
__device__
void
amd_inner_product_dlop
<
float4_t
,
float4_t
,
float
>
(
const
float4_t
&
a
,
const
float4_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
amd_inner_product_dlop
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I0
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I1
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I1
],
c
);
amd_inner_product_dlop
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I2
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I2
],
c
);
amd_inner_product_dlop
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I3
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I3
],
c
);
}
#if CK_USE_AMD_DLOP
template
<
>
__device__
void
amd_inner_product_dlop
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm
volatile
(
"
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
"
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot2
(
a
,
b
,
c
,
false
);
#endif
}
template
<
>
__device__
void
amd_inner_product_dlop
<
half4_t
,
half4_t
,
float
>
(
const
half4_t
&
a
,
const
half4_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
amd_inner_product_dlop
(
vector_type
<
half_t
,
4
>
{
a
}.
AsType
<
half2_t
>
()[
I0
],
vector_type
<
half_t
,
4
>
{
b
}.
AsType
<
half2_t
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
half_t
,
4
>
{
a
}.
AsType
<
half2_t
>
()[
I1
],
vector_type
<
half_t
,
4
>
{
b
}.
AsType
<
half2_t
>
()[
I1
],
c
);
}
template
<
>
__device__
void
amd_inner_product_dlop
<
half8_t
,
half8_t
,
float
>
(
const
half8_t
&
a
,
const
half8_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
amd_inner_product_dlop
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I0
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I1
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I1
],
c
);
amd_inner_product_dlop
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I2
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I2
],
c
);
amd_inner_product_dlop
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I3
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I3
],
c
);
}
template
<
>
__device__
void
amd_inner_product_dlop
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
{
#if CK_USE_AMD_DLOP_INLINE_ASM
asm
volatile
(
"
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
"
:
"=v"
(
c
)
:
"v"
(
as_type
<
int32_t
>
(
a
)),
"v"
(
as_type
<
int32_t
>
(
b
)),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot4
(
as_type
<
int32_t
>
(
a
),
as_type
<
int32_t
>
(
b
),
c
,
false
);
#endif
}
template
<
>
__device__
void
amd_inner_product_dlop
<
int8x8_t
,
int8x8_t
,
int32_t
>
(
const
int8x8_t
&
a
,
const
int8x8_t
&
b
,
int32_t
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
amd_inner_product_dlop
(
vector_type
<
int8_t
,
8
>
{
a
}.
AsType
<
int8x4_t
>
()[
I0
],
vector_type
<
int8_t
,
8
>
{
b
}.
AsType
<
int8x4_t
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
int8_t
,
8
>
{
a
}.
AsType
<
int8x4_t
>
()[
I1
],
vector_type
<
int8_t
,
8
>
{
b
}.
AsType
<
int8x4_t
>
()[
I1
],
c
);
}
template
<
>
__device__
void
amd_inner_product_dlop
<
int8x16_t
,
int8x16_t
,
int32_t
>
(
const
int8x16_t
&
a
,
const
int8x16_t
&
b
,
int32_t
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
amd_inner_product_dlop
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I0
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I0
],
c
);
amd_inner_product_dlop
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I1
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I1
],
c
);
amd_inner_product_dlop
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I2
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I2
],
c
);
amd_inner_product_dlop
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I3
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I3
],
c
);
}
#endif // CK_USE_AMD_DLOP
}
// namespace ck
#endif
composable_kernel/include/utility/amd_inline_asm.hpp
View file @
f2ac7832
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include "data_type.hpp"
#include "data_type.hpp"
#include "c_style_pointer_cast.hpp"
#include "c_style_pointer_cast.hpp"
// TODO: deprecate all amd_assembly_outer_product_xxx
namespace
ck
{
namespace
ck
{
// c0 += inner_product(a, b0)
// c0 += inner_product(a, b0)
...
...
composable_kernel/include/utility/common_header.hpp
View file @
f2ac7832
...
@@ -31,15 +31,13 @@
...
@@ -31,15 +31,13 @@
#include "static_buffer.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"
#include "dynamic_buffer.hpp"
#include "inner_product.hpp"
// TODO: remove this
// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#include "amd_inline_asm.hpp"
#endif
#endif
#if CK_USE_AMD_DLOP
#include "amd_dlop.hpp"
#endif
#if CK_USE_AMD_XDLOPS
#if CK_USE_AMD_XDLOPS
#include "amd_xdlops.hpp"
#include "amd_xdlops.hpp"
#endif
#endif
...
...
composable_kernel/include/utility/config.hpp
View file @
f2ac7832
...
@@ -14,12 +14,7 @@
...
@@ -14,12 +14,7 @@
// should enable one and only one GPU target
// should enable one and only one GPU target
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
#error Need to define a single GPU target
#error Need to define (only) one GPU target
#endif
// HIP version
#ifndef CK_HIP_VERSION_FLAT
#define CK_HIP_VERSION_FLAT 0
#endif
#endif
// launch bounds
// launch bounds
...
@@ -38,6 +33,16 @@
...
@@ -38,6 +33,16 @@
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif
#endif
// FMA instruction
#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
#define CK_USE_AMD_V_MAC_F32
#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \
defined(CK_AMD_GPU_GFX1030)
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8
#endif
// multi index
// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
...
@@ -46,13 +51,9 @@
...
@@ -46,13 +51,9 @@
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INLINE_ASM 1
#endif
#endif
// AMD DLOPS
// AMD inner product (DLOP)
#ifndef CK_USE_AMD_DLOP
#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
#define CK_USE_AMD_DLOP 1
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
#endif
#ifndef CK_USE_AMD_DLOP_INLINE_ASM
#define CK_USE_AMD_DLOP_INLINE_ASM 1
#endif
#endif
// AMD buffer addressing
// AMD buffer addressing
...
...
composable_kernel/include/utility/inner_product.hpp
0 → 100644
View file @
f2ac7832
#ifndef CK_INNER_PRODUCT_HPP
#define CK_INNER_PRODUCT_HPP
#include "data_type.hpp"
namespace
ck
{
template
<
typename
TA
,
typename
TB
,
typename
TC
>
__device__
void
inner_product
(
const
TA
&
a
,
const
TB
&
b
,
TC
&
c
);
template
<
>
__device__
void
inner_product
<
float
,
float
,
float
>
(
const
float
&
a
,
const
float
&
b
,
float
&
c
)
{
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
asm
volatile
(
"
\n
\
v_mac_f32 %0, %1, %2
\n
\
"
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
asm
volatile
(
"
\n
\
v_fmac_f32 %0, %1, %2
\n
\
"
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#else
c
+=
a
*
b
;
#endif
}
template
<
>
__device__
void
inner_product
<
float2_t
,
float2_t
,
float
>
(
const
float2_t
&
a
,
const
float2_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
inner_product
(
vector_type
<
float
,
2
>
{
a
}.
AsType
<
float
>
()[
I0
],
vector_type
<
float
,
2
>
{
b
}.
AsType
<
float
>
()[
I0
],
c
);
inner_product
(
vector_type
<
float
,
2
>
{
a
}.
AsType
<
float
>
()[
I1
],
vector_type
<
float
,
2
>
{
b
}.
AsType
<
float
>
()[
I1
],
c
);
}
template
<
>
__device__
void
inner_product
<
float4_t
,
float4_t
,
float
>
(
const
float4_t
&
a
,
const
float4_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
inner_product
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I0
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I0
],
c
);
inner_product
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I1
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I1
],
c
);
inner_product
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I2
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I2
],
c
);
inner_product
(
vector_type
<
float
,
4
>
{
a
}.
AsType
<
float
>
()[
I3
],
vector_type
<
float
,
4
>
{
b
}.
AsType
<
float
>
()[
I3
],
c
);
}
template
<
>
__device__
void
inner_product
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
{
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
asm
volatile
(
"
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
"
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot2
(
a
,
b
,
c
,
false
);
#endif
#else
const
auto
convert
=
type_convert
<
int32_t
>
{};
const
vector_type
<
half_t
,
2
>
a_vector
{
a
};
const
vector_type
<
half_t
,
2
>
b_vector
{
b
};
static_for
<
0
,
2
,
1
>
{}([
&
](
auto
i
)
{
c
+=
convert
(
a_vector
.
AsType
<
half_t
>
()[
i
])
*
convert
(
b_vector
.
AsType
<
half_t
>
()[
i
]);
});
#endif
}
template
<
>
__device__
void
inner_product
<
half4_t
,
half4_t
,
float
>
(
const
half4_t
&
a
,
const
half4_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
inner_product
(
vector_type
<
half_t
,
4
>
{
a
}.
AsType
<
half2_t
>
()[
I0
],
vector_type
<
half_t
,
4
>
{
b
}.
AsType
<
half2_t
>
()[
I0
],
c
);
inner_product
(
vector_type
<
half_t
,
4
>
{
a
}.
AsType
<
half2_t
>
()[
I1
],
vector_type
<
half_t
,
4
>
{
b
}.
AsType
<
half2_t
>
()[
I1
],
c
);
}
template
<
>
__device__
void
inner_product
<
half8_t
,
half8_t
,
float
>
(
const
half8_t
&
a
,
const
half8_t
&
b
,
float
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
inner_product
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I0
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I0
],
c
);
inner_product
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I1
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I1
],
c
);
inner_product
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I2
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I2
],
c
);
inner_product
(
vector_type
<
half_t
,
8
>
{
a
}.
AsType
<
half2_t
>
()[
I3
],
vector_type
<
half_t
,
8
>
{
b
}.
AsType
<
half2_t
>
()[
I3
],
c
);
}
template
<
>
__device__
void
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
{
#if defined(CK_USE_DOT4_I32_I8)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
asm
volatile
(
"
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
"
:
"=v"
(
c
)
:
"v"
(
as_type
<
int32_t
>
(
a
)),
"v"
(
as_type
<
int32_t
>
(
b
)),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot4
(
as_type
<
int32_t
>
(
a
),
as_type
<
int32_t
>
(
b
),
c
,
false
);
#endif
#else
const
auto
convert
=
type_convert
<
int32_t
>
{};
const
vector_type
<
int8_t
,
4
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
4
>
b_vector
{
b
};
static_for
<
0
,
4
,
1
>
{}([
&
](
auto
i
)
{
c
+=
convert
(
a_vector
.
AsType
<
int8_t
>
()[
i
])
*
convert
(
b_vector
.
AsType
<
int8_t
>
()[
i
]);
});
#endif
}
template
<
>
__device__
void
inner_product
<
int8x8_t
,
int8x8_t
,
int32_t
>
(
const
int8x8_t
&
a
,
const
int8x8_t
&
b
,
int32_t
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
inner_product
(
vector_type
<
int8_t
,
8
>
{
a
}.
AsType
<
int8x4_t
>
()[
I0
],
vector_type
<
int8_t
,
8
>
{
b
}.
AsType
<
int8x4_t
>
()[
I0
],
c
);
inner_product
(
vector_type
<
int8_t
,
8
>
{
a
}.
AsType
<
int8x4_t
>
()[
I1
],
vector_type
<
int8_t
,
8
>
{
b
}.
AsType
<
int8x4_t
>
()[
I1
],
c
);
}
template
<
>
__device__
void
inner_product
<
int8x16_t
,
int8x16_t
,
int32_t
>
(
const
int8x16_t
&
a
,
const
int8x16_t
&
b
,
int32_t
&
c
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
inner_product
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I0
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I0
],
c
);
inner_product
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I1
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I1
],
c
);
inner_product
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I2
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I2
],
c
);
inner_product
(
vector_type
<
int8_t
,
16
>
{
a
}.
AsType
<
int8x4_t
>
()[
I3
],
vector_type
<
int8_t
,
16
>
{
b
}.
AsType
<
int8x4_t
>
()[
I3
],
c
);
}
}
// namespace ck
#endif
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
View file @
f2ac7832
...
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
...
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const
auto
wei_desc_k_c_y_x
=
make_naive_tensor_descriptor_packed
(
wei_k_c_y_x_lengths
);
const
auto
wei_desc_k_c_y_x
=
make_naive_tensor_descriptor_packed
(
wei_k_c_y_x_lengths
);
const
auto
out_desc_n_k_ho_wo
=
make_naive_tensor_descriptor_packed
(
out_n_k_ho_wo_lengths
);
const
auto
out_desc_n_k_ho_wo
=
make_naive_tensor_descriptor_packed
(
out_n_k_ho_wo_lengths
);
#if
0
#if
1
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
// cdata = 64, BlockSize = 256
// cdata = 64, BlockSize = 256
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
f2ac7832
...
@@ -20,9 +20,9 @@
...
@@ -20,9 +20,9 @@
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_MODE 1
#define USE_MODE 1
#define USE_CONV_FWD_V4R4_NCHW
0
#define USE_CONV_FWD_V4R4_NCHW
1
#define USE_CONV_FWD_V4R4R2_NHWC
1
#define USE_CONV_FWD_V4R4R2_NHWC
0
#define USE_CONV_FWD_V6R1_NCHW
1
#define USE_CONV_FWD_V6R1_NCHW
0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
...
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#endif
#endif
#if
0
#if
1
using
in_data_t
=
float
;
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
using
out_data_t
=
float
;
...
...
gaoqiong
@gaoqiong
mentioned in commit
dfb80c4e
·
Dec 05, 2023
mentioned in commit
dfb80c4e
mentioned in commit dfb80c4e39ec7b304c3ebc88bab2a204bc4906b9
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment