Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f5654649
"...resnet50_tensorflow.git" did not exist on "a1a7aaca9c45d154b3a2641be40bcb93aa5f3c8d"
Commit
f5654649
authored
Apr 22, 2021
by
Chao Liu
Browse files
added back amd_assembly_outer_product_1x2 and amd_assembly_outer_product_1x4
parent
9d5d6afa
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
107 additions
and
77 deletions
+107
-77
composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
...le_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
+56
-13
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
...le_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+49
-60
composable_kernel/include/utility/amd_inline_asm.hpp
composable_kernel/include/utility/amd_inline_asm.hpp
+0
-2
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+2
-2
No files found.
composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
View file @
f5654649
...
@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
...
@@ -212,6 +212,8 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
M
=
CDesc
{}.
GetLength
(
I0
);
constexpr
auto
M
=
CDesc
{}.
GetLength
(
I0
);
constexpr
auto
N
=
CDesc
{}.
GetLength
(
I1
);
constexpr
auto
N
=
CDesc
{}.
GetLength
(
I1
);
...
@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
...
@@ -223,24 +225,65 @@ struct ThreadwiseGemm_km_kn_mn_v1r1
static_for
<
0
,
K
,
1
>
{}([
&
](
auto
k
)
{
static_for
<
0
,
K
,
1
>
{}([
&
](
auto
k
)
{
static_for
<
0
,
M
,
1
>
{}([
&
](
auto
m
)
{
static_for
<
0
,
M
,
1
>
{}([
&
](
auto
m
)
{
static_for
<
0
,
N
,
1
>
{}([
&
](
auto
n
)
{
constexpr
index_t
a_offset
=
ADesc
{}.
CalculateOffset
(
make_tuple
(
k
,
m
));
if
constexpr
(
N
==
2
)
{
constexpr
index_t
b_offset_0
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I0
));
constexpr
index_t
b_offset_1
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I1
));
constexpr
index_t
a_offset
=
constexpr
index_t
c_offset_0
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I0
));
ADesc
{}.
CalculateOffset
(
a_origin_idx
+
make_tuple
(
k
,
m
));
constexpr
index_t
c_offset_1
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I1
));
constexpr
index_t
b_offset
=
BDesc
{}.
CalculateOffset
(
b_origin_idx
+
make_tuple
(
k
,
n
));
amd_assembly_outer_product_1x2
(
a_buf
[
Number
<
a_offset
>
{}],
constexpr
index_t
c_offset
=
b_buf
[
Number
<
b_offset_0
>
{}],
CDesc
{}.
CalculateOffset
(
c_origin_idx
+
make_tuple
(
m
,
n
));
b_buf
[
Number
<
b_offset_1
>
{}],
c_buf
(
Number
<
c_offset_0
>
{}),
c_buf
(
Number
<
c_offset_1
>
{}));
}
else
if
constexpr
(
N
==
4
)
{
constexpr
index_t
b_offset_0
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I0
));
constexpr
index_t
b_offset_1
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I1
));
constexpr
index_t
b_offset_2
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I2
));
constexpr
index_t
b_offset_3
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
I3
));
constexpr
index_t
c_offset_0
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I0
));
constexpr
index_t
c_offset_1
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I1
));
constexpr
index_t
c_offset_2
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I2
));
constexpr
index_t
c_offset_3
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
m
,
I3
));
amd_assembly_outer_product_1x4
(
a_buf
[
Number
<
a_offset
>
{}],
b_buf
[
Number
<
b_offset_0
>
{}],
b_buf
[
Number
<
b_offset_1
>
{}],
b_buf
[
Number
<
b_offset_2
>
{}],
b_buf
[
Number
<
b_offset_3
>
{}],
c_buf
(
Number
<
c_offset_0
>
{}),
c_buf
(
Number
<
c_offset_1
>
{}),
c_buf
(
Number
<
c_offset_2
>
{}),
c_buf
(
Number
<
c_offset_3
>
{}));
}
else
{
static_for
<
0
,
N
,
1
>
{}([
&
](
auto
n
)
{
constexpr
index_t
a_offset
=
ADesc
{}.
CalculateOffset
(
a_origin_idx
+
make_tuple
(
k
,
m
));
constexpr
index_t
b_offset
=
BDesc
{}.
CalculateOffset
(
b_origin_idx
+
make_tuple
(
k
,
n
));
constexpr
index_t
c_offset
=
CDesc
{}.
CalculateOffset
(
c_origin_idx
+
make_tuple
(
m
,
n
));
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
amd_assembly_inner_product
(
a_buf
[
Number
<
a_offset
>
{}],
amd_assembly_inner_product
(
a_buf
[
Number
<
a_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}],
c_buf
(
Number
<
c_offset
>
{}));
c_buf
(
Number
<
c_offset
>
{}));
#else
#else
c_buf
(
Number
<
c_offset
>
{})
+=
inner_product_with_conversion
<
FloatC
>
{}(
c_buf
(
Number
<
c_offset
>
{})
+=
inner_product_with_conversion
<
FloatC
>
{}(
a_buf
[
Number
<
a_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}]);
a_buf
[
Number
<
a_offset
>
{}],
b_buf
[
Number
<
b_offset
>
{}]);
#endif
#endif
});
});
}
});
});
});
});
}
}
...
...
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
View file @
f5654649
...
@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3
...
@@ -61,85 +61,74 @@ struct ThreadwiseGemm_km_kn_mn_v3
static_for
<
0
,
E
,
1
>
{}([
&
](
auto
e
)
{
static_for
<
0
,
E
,
1
>
{}([
&
](
auto
e
)
{
static_for
<
0
,
K
,
1
>
{}([
&
](
auto
k
)
{
static_for
<
0
,
K
,
1
>
{}([
&
](
auto
k
)
{
#if 0
constexpr
index_t
a_offset
=
ADesc
{}.
CalculateOffset
(
make_tuple
(
e
,
k
));
constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
if
constexpr
(
H
==
2
&&
W
==
2
)
if
constexpr
(
H
==
2
&&
W
==
2
)
{
{
constexpr
auto
b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr
index_t
b_offset_0
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
0
));
constexpr
auto
b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
constexpr
index_t
b_offset_1
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
1
));
constexpr
auto
b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr
index_t
b_offset_2
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
1
,
0
));
constexpr
auto
b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
constexpr
index_t
b_offset_3
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
1
,
1
));
constexpr
auto
c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr
index_t
c_offset_0
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
0
));
constexpr
auto
c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
constexpr
index_t
c_offset_1
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
1
));
constexpr
auto
c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr
index_t
c_offset_2
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
1
,
0
));
constexpr
auto
c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
constexpr
index_t
c_offset_3
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
1
,
1
));
amd_assembly_outer_product_1x4(p_a[a_offset],
amd_assembly_outer_product_1x4
(
p_a
[
Number
<
a_offset
>
{}
],
p_b[b_offset_0],
p_b
[
Number
<
b_offset_0
>
{}
],
p_b[b_offset_1],
p_b
[
Number
<
b_offset_1
>
{}
],
p_b[b_offset_2],
p_b
[
Number
<
b_offset_2
>
{}
],
p_b[b_offset_3],
p_b
[
Number
<
b_offset_3
>
{}
],
p_c[c_offset_0],
p_c
[
Number
<
c_offset_0
>
{}
],
p_c[c_offset_1],
p_c
[
Number
<
c_offset_1
>
{}
],
p_c[c_offset_2],
p_c
[
Number
<
c_offset_2
>
{}
],
p_c[c_offset_3]);
p_c
[
Number
<
c_offset_3
>
{}
]);
}
}
else
if
constexpr
(
H
==
4
&&
W
==
1
)
else
if
constexpr
(
H
==
4
&&
W
==
1
)
{
{
constexpr
auto
b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
constexpr
index_t
b_offset_0
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
0
,
0
));
constexpr
auto
b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
constexpr
index_t
b_offset_1
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
1
,
0
));
constexpr
auto
b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
constexpr
index_t
b_offset_2
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
2
,
0
));
constexpr
auto
b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
constexpr
index_t
b_offset_3
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
3
,
0
));
constexpr
auto
c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
constexpr
index_t
c_offset_0
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
0
,
0
));
constexpr
auto
c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
constexpr
index_t
c_offset_1
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
1
,
0
));
constexpr
auto
c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
constexpr
index_t
c_offset_2
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
2
,
0
));
constexpr
auto
c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
constexpr
index_t
c_offset_3
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
3
,
0
));
amd_assembly_outer_product_1x4(p_a[a_offset],
amd_assembly_outer_product_1x4
(
p_a
[
Number
<
a_offset
>
{}
],
p_b[b_offset_0],
p_b
[
Number
<
b_offset_0
>
{}
],
p_b[b_offset_1],
p_b
[
Number
<
b_offset_1
>
{}
],
p_b[b_offset_2],
p_b
[
Number
<
b_offset_2
>
{}
],
p_b[b_offset_3],
p_b
[
Number
<
b_offset_3
>
{}
],
p_c[c_offset_0],
p_c
[
Number
<
c_offset_0
>
{}
],
p_c[c_offset_1],
p_c
[
Number
<
c_offset_1
>
{}
],
p_c[c_offset_2],
p_c
[
Number
<
c_offset_2
>
{}
],
p_c[c_offset_3]);
p_c
[
Number
<
c_offset_3
>
{}
]);
}
}
else
else
{
{
static_for
<
0
,
H
,
1
>
{}([
&
](
auto
h
)
{
static_for
<
0
,
H
,
1
>
{}([
&
](
auto
h
)
{
static_for
<
0
,
W
,
1
>
{}([
&
](
auto
w
)
{
static_for
<
0
,
W
,
1
>
{}([
&
](
auto
w
)
{
constexpr
auto
b_offset =
constexpr
index_t
b_offset
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
h
,
w
));
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
h
,
w
));
constexpr
auto
c_offset =
constexpr
index_t
c_offset
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
h
,
w
));
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
h
,
w
));
p_c[c_offset] += inner_product_with_conversion<FloatC>{}(p_a[a_offset],
#if 0
p_b[b_offset]);
p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
p_b[Number<b_offset>{}]);
#else
amd_assembly_inner_product
(
p_a
[
Number
<
a_offset
>
{}],
p_b
[
Number
<
b_offset
>
{}],
p_c
[
Number
<
c_offset
>
{}]);
#endif
});
});
});
});
}
}
#else
constexpr
index_t
a_offset
=
ADesc
{}.
CalculateOffset
(
make_tuple
(
e
,
k
));
static_for
<
0
,
H
,
1
>
{}([
&
](
auto
h
)
{
static_for
<
0
,
W
,
1
>
{}([
&
](
auto
w
)
{
constexpr
index_t
b_offset
=
BDesc
{}.
CalculateOffset
(
make_tuple
(
e
,
0
,
h
,
w
));
constexpr
index_t
c_offset
=
CDesc
{}.
CalculateOffset
(
make_tuple
(
k
,
0
,
h
,
w
));
amd_assembly_inner_product
(
p_a
[
Number
<
a_offset
>
{}],
p_b
[
Number
<
b_offset
>
{}],
p_c
[
Number
<
c_offset
>
{}]);
});
});
#endif
});
});
});
});
}
}
...
...
composable_kernel/include/utility/amd_inline_asm.hpp
View file @
f5654649
...
@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
...
@@ -74,7 +74,6 @@ __device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t&
c
);
c
);
}
}
#if 0
// c0 += inner_product(a, b0)
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
// c1 += inner_product(a, b1)
__device__
void
amd_assembly_outer_product_1x2
(
float
a
,
float
b0
,
float
b1
,
float
&
c0
,
float
&
c1
)
__device__
void
amd_assembly_outer_product_1x2
(
float
a
,
float
b0
,
float
b1
,
float
&
c0
,
float
&
c1
)
...
@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
...
@@ -438,7 +437,6 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
c2
,
c2
,
c3
);
c3
);
}
}
#endif
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/utility/config.amd.hpp.in
View file @
f5654649
...
@@ -28,11 +28,11 @@
...
@@ -28,11 +28,11 @@
#endif
#endif
// launch bounds
// launch bounds
#define CK_USE_LAUNCH_BOUNDS
0
#define CK_USE_LAUNCH_BOUNDS
1
#ifdef CK_USE_LAUNCH_BOUNDS
#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU
2
#define CK_MIN_BLOCK_PER_CU
1
#endif
#endif
// buffer resourse
// buffer resourse
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment