gaoqiong / composable_kernel_ROCM · Commits

Commit e6bb1dd7 (Unverified)
Authored Jul 19, 2024 by Po Yen Chen; committed via GitHub, Jul 19, 2024

    Merge branch 'develop' into feature/check-window-lengths

Parents: 9d6a3704, ab250afd

Changes 538 · Showing 20 changed files with 1389 additions and 27 deletions (+1389 −27)
codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp       +60   −0
codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp  +56   −0
codegen/include/ck/host/headers.hpp                                              +0    −1
codegen/include/ck/host/operation/gemm.hpp                                       +1    −1
codegen/include/ck/host/stringutils.hpp                                          +1    −1
codegen/include/ck/host/types.hpp                                                +13   −5
codegen/include/ck/host/utils.hpp                                                +3    −2
codegen/src/device_gemm_multiple_d.cpp                                           +10   −5
codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp                    +54   −3
codegen/src/device_grouped_conv_fwd_multiple_abd.cpp                             +42   −0
codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp      +364  −0
codegen/src/headers.cpp                                                          +1    −1
codegen/src/types.cpp                                                            +8    −0
codegen/src/utils.cpp                                                            +1    −1
codegen/test/CMakeLists.txt                                                      +8    −6
codegen/test/common.hpp                                                          +134  −0
codegen/test/gemm_multiple_d.cpp                                                 +6    −1
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp                                  +209  −0
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp                                  +209  −0
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp                                  +209  −0
codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
#include "ck/host/operation/gemm.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"

namespace ck {
namespace host {
namespace conv {

// defines the values needed for an instance of forward convolution and functions to return
// (templated) instances
struct Operation_Conv_Fwd_Xdl_Cshuffle
{
    // returns a vector of instances given the fusion operations, uses default values for problem
    // spec
    static std::vector<Operation_Conv_Fwd_Xdl_Cshuffle>
    CreateOperations(const std::string& prologue, const std::string& epilogue);
    // returns a vector of instances, provided with a problem spec and fusion operations
    static std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> CreateOperations(
        const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue);
    std::size_t NumDim;
    TensorDesc A{};
    TensorDesc B{};
    DataType acc     = DataType::Float;
    DataType cs_type = DataType::Half;
    std::vector<TensorDesc> Ds = {};
    TensorDesc E{};
    std::string a_elem_op   = PassThrough;
    std::string b_elem_op   = PassThrough;
    std::string cde_elem_op = PassThrough;
    std::string prologue    = "";
    std::string epilogue    = "";
    std::string conv_specialization =
        "ck::tensor_operation::device::ConvolutionForwardSpecialization::Default";
    std::string gemm_specialization =
        "ck::tensor_operation::device::GemmSpecialization::MNKPadding";
    // tuning parameters
    operation::TileDesc tile_desc{};
    operation::BlockTransferDesc a_block_transfer{};
    operation::BlockTransferDesc b_block_transfer{};
    operation::CShuffleDesc cshuffle{};
    operation::CBlockTransferDesc c_block_transfer{};
    // functions to update fusion operations if they are provided
    void update_prologue(const std::string& prologue);
    void update_epilogue(const std::string& epilogue);
    // returns a templated instance
    Solution ToSolution() const;
};

} // namespace conv
} // namespace host
} // namespace ck
codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <vector>
#include <memory>
#include <sstream>
#include <iterator>
#include <numeric>
#include "ck/host/types.hpp"

namespace ck {
namespace host {
namespace conv {

// defines the problem specification for a forward convolution operation
struct Problem_Conv_Fwd
{
    std::size_t NumDim = 0;
    // size of a forward convolution operation
    std::size_t G  = 0;
    std::size_t N  = 0;
    std::size_t C  = 0;
    std::size_t Hi = 0;
    std::size_t Wi = 0;
    std::size_t Ho = 0;
    std::size_t Wo = 0;
    std::size_t K  = 0;
    std::size_t Y  = 0;
    std::size_t X  = 0;
    Layout ALayout = Layout::NHWGC;
    Layout BLayout = Layout::GKYXC;
    Layout ELayout = Layout::NHWGK;
    std::vector<Layout> DsLayout     = {};
    DataType ADataType               = DataType::Half;
    DataType BDataType               = DataType::Half;
    DataType EDataType               = DataType::Half;
    std::vector<DataType> DsDataType = {};
    std::string AElementOp   = "ck::tensor_operation::element_wise::PassThrough";
    std::string BElementOp   = "ck::tensor_operation::element_wise::PassThrough";
    std::string CDEElementOp = "ck::tensor_operation::element_wise::PassThrough";
    // returns the correct device op file for the operation
    std::string GetIncludeHeader() const;
    // returns a list of instances based on the problem spec and provided fusion operations
    std::vector<Solution> GetSolutions(const std::string& arch,
                                       const std::string& prologue,
                                       const std::string& epilogue) const;
};

} // namespace conv
} // namespace host
} // namespace ck
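Together these two headers form the host-side entry point of the new conv codegen path: describe a problem, ask for solutions, get interpolated kernel sources back. A minimal driver sketch (hypothetical code, not part of the commit; it assumes a 2-D NHWGC problem like the tests below and that "gfx90a" is an XDL-capable arch string):

    #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
    #include <iostream>

    int main()
    {
        ck::host::conv::Problem_Conv_Fwd prob;
        prob.NumDim = 2;               // 2-D forward convolution
        prob.G  = 32;                  // groups
        prob.N  = 256;                 // batch
        prob.C  = 32;                  // input channels
        prob.K  = 64;                  // output channels
        prob.Y  = 3;  prob.X  = 3;     // filter size
        prob.Hi = 28; prob.Wi = 28;    // input spatial dims
        prob.Ho = 28; prob.Wo = 28;    // output spatial dims

        // Empty prologue/epilogue = no user fusion; element ops stay PassThrough.
        for(const auto& solution : prob.GetSolutions("gfx90a", "", ""))
            std::cout << solution.ToTemplateString() << "\n\n";
    }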
codegen/include/ck/host/headers.hpp

@@ -4,7 +4,6 @@
 #pragma once
 #include <string>
-#include <string_view>
 #include <utility>
 #include <unordered_map>
 #include <vector>
codegen/include/ck/host/operation/gemm.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
codegen/include/ck/host/stringutils.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
codegen/include/ck/host/types.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

@@ -12,6 +12,7 @@
 namespace ck {
 namespace host {
 
+// holds the templated instance, substitues values into template from instancess
 struct Solution
 {

@@ -33,6 +34,7 @@ struct Solution
     std::unordered_map<std::string, std::string> template_values;
 };
 
+// supported data types
 enum class DataType
 {
     Half,

@@ -40,22 +42,28 @@ enum class DataType
     Int8,
     Int32
 };
 std::string ToString(DataType dt);
 
+// supported layouts: gemm and fwd conv
 enum class Layout
 {
     Row,
-    Column
+    Column,
+    GKYXC,
+    GKCYX,
+    GNHWK,
+    GNHWC,
+    NHWGC,
+    NHWGK
 };
 std::string ToString(Layout dl);
 
+Layout ToLayout(bool Trans); // returns the layout for gemm
+
+// supported GEMM types
 enum class GemmType
 {
     Default
 };
 std::string ToString(GemmType gt);
 
 struct TensorDesc
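The new enumerators line up one-to-one with CK's convolution layout tags, and ToLayout simply folds a transpose flag into a GEMM layout (both are implemented in the types.cpp hunk further down). A small self-check sketch, assuming the header above is on the include path:

    #include "ck/host/types.hpp"
    #include <cassert>

    int main()
    {
        using namespace ck::host;
        // transpose flag -> GEMM layout
        assert(ToLayout(false) == Layout::Row);
        assert(ToLayout(true) == Layout::Column);
        // each conv enumerator stringifies to the matching CK layout tag
        assert(ToString(Layout::NHWGC) == "ck::tensor_layout::convolution::NHWGC");
        assert(ToString(Layout::GKYXC) == "ck::tensor_layout::convolution::GKYXC");
    }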
codegen/include/ck/host/utils.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdint>
 #include <unordered_set>
+#include <numeric>
+#include <iterator>
 namespace ck {
 namespace host {

@@ -12,6 +14,5 @@ namespace host {
 std::size_t integer_divide_ceil(std::size_t x, std::size_t y);
 const std::unordered_set<std::string>& get_xdlop_archs();
 } // namespace host
 } // namespace ck
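integer_divide_ceil is the rounding helper the tests use for grid sizing; a short illustration with made-up sizes (the grid-size pattern matches the GEMM test further down):

    #include "ck/host/utils.hpp"
    #include <cassert>
    #include <cstddef>

    int main()
    {
        using ck::host::integer_divide_ceil;
        assert(integer_divide_ceil(1024, 128) == 8); // exact fit: 8 full tiles
        assert(integer_divide_ceil(1000, 128) == 8); // 7 full tiles + 1 partial tile
        // grid-sizing pattern used by the GEMM test below:
        std::size_t M = 1024, N = 1024, m_per_block = 128, n_per_block = 128;
        assert(integer_divide_ceil(M, m_per_block) * integer_divide_ceil(N, n_per_block) == 64);
    }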
codegen/src/device_gemm_multiple_d.cpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "ck/host/device_gemm_multiple_d/problem.hpp"
 #include "ck/host/device_gemm_multiple_d/operation.hpp"

@@ -11,19 +11,24 @@ namespace ck {
 namespace host {
 namespace device_gemm_multiple_d {
 
+// return the relevant device op file based on the operation
 std::string Problem::GetIncludeHeader() const
 {
     return "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp";
 }
 
-std::vector<Solution> Problem::GetSolutions(const std::string& arch) const
+// returns templated instances when provided with a problem specification
+std::vector<Solution> Problem::GetSolutions(const std::string& arch,
+                                            const std::string& prologue,
+                                            const std::string& epilogue) const
 {
     if(get_xdlop_archs().count(arch) == 0)
         return {};
-    auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations(*this);
+    // obtains vector of instances
+    auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations(
+        *this, prologue, epilogue);
     std::vector<Solution> result;
     std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
-        return op.ToSolution();
+        return op.ToSolution(); // template instance with correct values
     });
     return result;
 }
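This is a source-breaking signature change: every caller of Problem::GetSolutions now has to pass the fusion strings explicitly (the gemm_multiple_d.cpp test update further down does exactly this migration). Sketch:

    // before this commit:
    //   auto solutions = prob.GetSolutions("gfx90a");
    // after (empty strings keep the default PassThrough element ops):
    auto solutions = prob.GetSolutions("gfx90a", /*prologue=*/"", /*epilogue=*/"");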
codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp

@@ -10,6 +10,7 @@ namespace ck {
 namespace host {
 namespace device_gemm_multiple_d {
 
+// calculate appropriate Gemm Specification based on input tensor dimensions
 static std::string GetGemmSpec(const std::size_t m,
                                const std::size_t n,
                                const std::size_t k,

@@ -30,9 +31,40 @@ static std::string GetGemmSpec(const std::size_t m,
     return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
 }
 
+// function to update prologue/epilogue with user provided operation
+void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue)
+{
+    if(!prologue.empty())
+    {
+        this->prologue    = prologue;
+        this->cde_elem_op = "CDEElementOp";
+    }
+    else
+    {
+        this->prologue = "";
+    }
+}
+
+void Operation_Xdl_CShuffle::update_epilogue(const std::string& epilogue)
+{
+    if(!epilogue.empty())
+    {
+        this->epilogue    = epilogue;
+        this->cde_elem_op = "CDEElementOp";
+    }
+    else
+    {
+        this->epilogue = "";
+    }
+}
+
+// accounts for all possible combinations of Row/Col major
 static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }
 
-std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(const Problem& prob)
+// Hard-code tuning parameters in modularized fashion, string them together into a vector of
+// instances
+std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
+    const Problem& prob, const std::string& prologue, const std::string& epilogue)
 {
     std::vector<Operation_Xdl_CShuffle> result;

@@ -155,6 +187,7 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(con
         // clang-format on
     };
 
+    // choose correct arrangement of tuning parameters based on the layout of each tensor
     const auto a_block_descriptions =
         prob.TransA ? a_block_descriptions_colmajor : a_block_descriptions_rowmajor;
     const auto b_block_descriptions =

@@ -165,6 +198,7 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(con
     assert(tile_descriptions.size() == cshuffle_descriptions.size());
     assert(tile_descriptions.size() == c_block_descriptions.size());
 
+    // Put all values together into a single operation > store into the result vector
     for(std::size_t i = 0; i < tile_descriptions.size(); i++)
     {
         Operation_Xdl_CShuffle x;

@@ -188,12 +222,17 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(con
                                               x.tile_desc.m_per_block,
                                               x.tile_desc.n_per_block,
                                               x.tile_desc.k_per_block);
+        x.update_prologue(prologue);
+        x.update_epilogue(epilogue);
         result.push_back(x);
     }
     return result;
 }
 
-std::vector<std::vector<Operation_Xdl_CShuffle>> Operation_Xdl_CShuffle::CreateOperations()
+// set up instances when not provided with a problem specification, use default operation values and
+// all possible layout combinations
+std::vector<std::vector<Operation_Xdl_CShuffle>>
+Operation_Xdl_CShuffle::CreateOperations(const std::string& prologue, const std::string& epilogue)
 {
     std::vector<Problem> problems;
     for(bool TransA : {true, false})

@@ -204,7 +243,8 @@ std::vector<std::vector<Operation_Xdl_CShuffle>> Operation_Xdl_CShuffle::CreateO
         prob.TransB = TransB;
         problems.push_back(prob);
     }
-    return Transform(problems, [](const Problem& p) { return CreateOperations(p); });
+    return Transform(
+        problems, [&](const Problem& p) { return CreateOperations(p, prologue, epilogue); });
 }
 
 static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate =

@@ -224,9 +264,20 @@ static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate =
     "${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, "
     "${CDEBlockTransferScalarPerVector_NPerBlock}>";
 
+// use hardcoded instances from vector of operations to substitute values into instance template
 Solution Operation_Xdl_CShuffle::ToSolution() const
 {
     std::unordered_map<std::string, std::string> values = {
+        {"name",
+         std::to_string(this->tile_desc.block_size) + "_" +
+             std::to_string(this->tile_desc.m_per_block) + "_" +
+             std::to_string(this->tile_desc.n_per_block) + "_" +
+             std::to_string(this->tile_desc.k_per_block) + "_" +
+             std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) +
+             "_" + std::to_string(this->tile_desc.m_per_XDL) + "_" +
+             std::to_string(this->tile_desc.n_per_XDL) + "_" +
+             std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" +
+             std::to_string(this->tile_desc.n_Xdl_per_wave)},
         {"LayoutA", ToString(this->A.layout)},
         {"LayoutB", ToString(this->B.layout)},
         {"LayoutDs",
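The padding choice in GetGemmSpec is easiest to see with concrete numbers. Below is a standalone restatement of the same rule (the hypothetical helper gemm_spec mirrors the function body shown in full in the conv operation file further down; the sizes are illustrative only):

    #include <cstddef>
    #include <iostream>
    #include <string>

    // Same rule as GetGemmSpec: pad a dimension iff tiles do not cover it exactly.
    static std::string gemm_spec(std::size_t m, std::size_t n, std::size_t k,
                                 std::size_t mpb, std::size_t npb, std::size_t kpb)
    {
        auto ceil_div = [](std::size_t x, std::size_t y) { return (x + y - 1) / y; };
        std::string spec;
        if(ceil_div(m, mpb) * mpb != m) spec += "M"; // M needs padding
        if(ceil_div(n, npb) * npb != n) spec += "N"; // N needs padding
        if(ceil_div(k, kpb) * kpb != k) spec += "K"; // K needs padding
        if(spec.empty())
            return "ck::tensor_operation::device::GemmSpecialization::Default";
        return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
    }

    int main()
    {
        std::cout << gemm_spec(1024, 1024, 64, 128, 128, 32) << "\n"; // ...::Default
        std::cout << gemm_spec(1000, 1024, 64, 128, 128, 32) << "\n"; // ...::MPadding
    }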
codegen/src/device_grouped_conv_fwd_multiple_abd.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/utils.hpp"
#include <algorithm>
#include <iostream>

namespace ck {
namespace host {
namespace conv {

// return the relevant device op file based on the operation
// NOTE: this is a modified version of the original CK file that calls the kernel from a device
// function and makes the Argument class accessible on the device
std::string Problem_Conv_Fwd::GetIncludeHeader() const
{
    return "ck/tensor_operation/gpu/device/impl/"
           "codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp";
}

// return vector of forward convolution instances when provided with a problem instance
std::vector<Solution> Problem_Conv_Fwd::GetSolutions(const std::string& arch,
                                                     const std::string& prologue,
                                                     const std::string& epilogue) const
{
    if(get_xdlop_archs().count(arch) == 0)
        return {};
    auto ops = ck::host::conv::Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(
        *this, prologue, epilogue);
    std::vector<Solution> result;
    std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
        return op.ToSolution();
    });
    return result;
}

} // namespace conv
} // namespace host
} // namespace ck
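Note that GetSolutions gates on get_xdlop_archs() and returns an empty vector rather than failing loudly for unsupported targets, so callers should treat an empty result as "no XDL instances for this arch". A hedged sketch (arch string and error handling are illustrative):

    auto solutions = prob.GetSolutions(arch, prologue, epilogue);
    if(solutions.empty())
        std::cerr << "no XDL conv instances generated for arch: " << arch << "\n";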
codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include <iostream>
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include <cassert>

namespace ck {
namespace host {
namespace conv {

// calculate appropriate Gemm Specification based on input tensor dimensions
// NOTE: in CK, MNKPadding is always used for forward convolution
static std::string GetGemmSpec(const std::size_t m,
                               const std::size_t n,
                               const std::size_t k,
                               const std::size_t m_per_block,
                               const std::size_t n_per_block,
                               const std::size_t k_per_block)
{
    std::string spec = "";
    if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
        spec += "M";
    if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
        spec += "N";
    if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
        spec += "K";
    if(spec == "")
        return "ck::tensor_operation::device::GemmSpecialization::Default";

    return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}

// function to update prologue/epilogue with user provided operation
void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologue)
{
    if(!prologue.empty())
    {
        this->prologue    = prologue;
        this->cde_elem_op = "CDEElementOp";
    }
    else
    {
        this->prologue = "";
    }
}

void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epilogue)
{
    if(!epilogue.empty())
    {
        this->epilogue    = epilogue;
        this->cde_elem_op = "CDEElementOp";
    }
    else
    {
        this->epilogue = "";
    }
}

// Hard-code tuning parameters in modularized fashion, string them together into a vector of
// instances
std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(
    const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue)
{
    std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> result;

    std::vector<operation::TileDesc> tile_descriptions = {
        // clang-format off
        //  Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
        //   Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per| Prefetch|
        //       |      |      |      |    |    |     |     | Wave| Wave|    Stage|
        {    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,        1},
        {   256,   128,   256,    32,   8,   8,   32,   32,    4,    2,        1},
        {   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,        1},
        {    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,        1},
        {   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,        1},
        {   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,        1}
        // clang-format on
    };

    std::vector<operation::BlockTransferDesc> a_block_descriptions = {
        // clang-format off
        //  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
        //   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|
        // Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |
        {S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        {S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}
        // clang-format on
    };

    std::vector<operation::BlockTransferDesc> b_block_descriptions = {
        // clang-format off
        //  BBlockTransfer| BBlockTransfer| BBlockTransfer|  BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
        //   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|
        // Lengths_K0_N_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |
        {S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        {S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        {S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        {S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}
        // clang-format on
    };

    std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
        // clang-format off
        //    CShuffle|    CShuffle|
        // MXdlPerWave| NXdlPerWave|
        //  PerShuffle|  PerShuffle|
        {1, 1},
        {1, 1},
        {1, 1},
        {1, 1},
        {1, 1},
        {1, 1}
        // clang-format on
    };

    std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
        // clang-format off
        // CBlockTransferClusterLengths|  CBlockTransfer
        //         _MBlock_MWaveMPerXdl| ScalarPerVector
        //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
        {S<1, 16, 1, 4>, 1},
        {S<1, 32, 1, 8>, 8},
        {S<1, 32, 1, 8>, 8},
        {S<1, 16, 1, 4>, 1},
        {S<1, 32, 1, 8>, 8},
        {S<1, 16, 1, 8>, 8}
        // clang-format on
    };

    assert(tile_descriptions.size() == a_block_descriptions.size());
    assert(tile_descriptions.size() == b_block_descriptions.size());
    assert(tile_descriptions.size() == cshuffle_descriptions.size());
    assert(tile_descriptions.size() == c_block_descriptions.size());

    // Put all values together into a single operation > store into the result vector
    for(std::size_t i = 0; i < tile_descriptions.size(); i++)
    {
        Operation_Conv_Fwd_Xdl_Cshuffle x;
        x.NumDim           = prob.NumDim;
        x.tile_desc        = tile_descriptions[i];
        x.a_block_transfer = a_block_descriptions[i];
        x.b_block_transfer = b_block_descriptions[i];
        x.cshuffle         = cshuffle_descriptions[i];
        x.c_block_transfer = c_block_descriptions[i];
        x.A                = TensorDesc{prob.ADataType, prob.ALayout};
        x.B                = TensorDesc{prob.BDataType, prob.BLayout};
        x.E                = TensorDesc{prob.EDataType, prob.ELayout};
        x.Ds               = Transform(prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) {
            return TensorDesc{dt, lo};
        });
        x.a_elem_op   = prob.AElementOp;
        x.b_elem_op   = prob.BElementOp;
        x.cde_elem_op = prob.CDEElementOp;
        x.update_prologue(prologue);
        x.update_epilogue(epilogue);
        result.push_back(x);
    }
    return result;
}

// set up instances when not provided with a problem specification, use default operation values
std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(
    const std::string& prologue, const std::string& epilogue)
{
    Problem_Conv_Fwd prob;
    return CreateOperations(prob, prologue, epilogue);
}

static const char* const CopyDevice_ConvTemplate =
    R"(
${Prologue}
${Epilogue}

using CDEElementOp = Epilogue;
using DeviceConv = ck::tensor_operation::device::CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<${NumDim}, ${LayoutA}, ${LayoutB}, ${LayoutDs}, ${LayoutE}, ${ADataType}, ${BDataType}, ${AccDataType}, ${CShuffleDataType}, ${DsDataType}, ${EDataType}, ${AElementwiseOperation}, ${BElementwiseOperation}, ${CDEElementwiseOperation}, ${ConvSpecialization}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, ${MPerBlock}, ${NPerBlock}, ${KPerBlock}, ${AK1}, ${BK1}, ${MPerXDL}, ${NPerXDL}, ${MXdlPerWave}, ${NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, ${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, ${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, ${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, ${BBlockTransferThreadClusterLengths_BK0_N_BK1}, ${BBlockTransferThreadClusterArrangeOrder}, ${BBlockTransferSrcAccessOrder}, ${BBlockTransferSrcVectorDim}, ${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, ${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, ${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, ${CDEBlockTransferScalarPerVector_NPerBlock}>;

constexpr ck::index_t NumATensor = ck::tensor_operation::device::GetNumABTensors<false, ${ADataType}>();
constexpr ck::index_t NumBTensor = ck::tensor_operation::device::GetNumABTensors<false, ${BDataType}>();

extern "C" __global__ void run_${name}(
    const ${ADataType}* in_dev,
    const ${BDataType}* wei_dev,
    ${EDataType}* __restrict__ out_dev,
    ck::Array<ck::index_t, ${NumDim} + 3> in_lengths,
    ck::Array<ck::index_t, ${NumDim} + 3> in_strides,
    ck::Array<ck::index_t, ${NumDim} + 3> wei_lengths,
    ck::Array<ck::index_t, ${NumDim} + 3> wei_strides,
    ck::Array<ck::index_t, ${NumDim} + 3> out_lengths,
    ck::Array<ck::index_t, ${NumDim} + 3> out_strides,
    ck::Array<ck::index_t, ${NumDim}> conv_filter_strides,
    ck::Array<ck::index_t, ${NumDim}> conv_filter_dilations,
    ck::Array<ck::index_t, ${NumDim}> input_left_pads,
    ck::Array<ck::index_t, ${NumDim}> input_right_pads,
    const ${AElementwiseOperation} a_element_op,
    const ${BElementwiseOperation} b_element_op,
    const ${CDEElementwiseOperation} cde_element_op
){
    auto arg = DeviceConv::Argument(in_dev,
                                    wei_dev,
                                    ck::Array<const void*, 0>{},
                                    out_dev,
                                    in_lengths,
                                    in_strides,
                                    wei_lengths,
                                    wei_strides,
                                    ck::Array<ck::Array<ck::index_t, ${NumDim} + 3>, 0>{},
                                    ck::Array<ck::Array<ck::index_t, ${NumDim} + 3>, 0>{},
                                    out_lengths,
                                    out_strides,
                                    conv_filter_strides,
                                    conv_filter_dilations,
                                    input_left_pads,
                                    input_right_pads,
                                    ${AElementwiseOperation}{},
                                    ${BElementwiseOperation}{},
                                    ${CDEElementwiseOperation}{1.0f, 1.0f});

    constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler();

    // GridwiseGemm
    using GridwiseGemm = DeviceConv::GridwiseGemm;

    static constexpr auto I0 = ck::Number<0>{};

    ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
        GridwiseGemm,
        const ${ADataType}*,
        const ${BDataType}*,
        typename GridwiseGemm::DsGridPointer,
        ${EDataType},
        ${AElementwiseOperation},
        ${BElementwiseOperation},
        ${CDEElementwiseOperation},
        DeviceConv::AGridDesc_AK0_M_AK1,
        DeviceConv::BGridDesc_BK0_N_BK1,
        DeviceConv::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
        DeviceConv::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
        DeviceConv::Block2ETileMap,
        ck::tensor_operation::device::ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, 0>,
        ck::integral_constant<bool, true>{},
        false,
        false>
        (
            arg.p_as_grid_.At(I0),
            arg.p_bs_grid_.At(I0),
            arg.p_ds_grid_,
            arg.p_e_grid_,
            arg.a_element_op_,
            arg.b_element_op_,
            arg.cde_element_op_,
            arg.a_g_n_c_wis_lengths_[0], // Group count
            arg.a_grid_desc_ak0_m_ak1_,
            arg.b_grid_desc_bk0_n_bk1_,
            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
            arg.block_2_etile_map_,
            arg.compute_ptr_offset_of_batch_
        );
}
)";

// use hardcoded instances from vector of operations to substitute values into instance template
Solution Operation_Conv_Fwd_Xdl_Cshuffle::ToSolution() const
{
    std::unordered_map<std::string, std::string> values = {
        {"name",
         std::to_string(this->tile_desc.block_size) + "_" +
             std::to_string(this->tile_desc.m_per_block) + "_" +
             std::to_string(this->tile_desc.n_per_block) + "_" +
             std::to_string(this->tile_desc.k_per_block) + "_" +
             std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) +
             "_" + std::to_string(this->tile_desc.m_per_XDL) + "_" +
             std::to_string(this->tile_desc.n_per_XDL) + "_" +
             std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" +
             std::to_string(this->tile_desc.n_Xdl_per_wave)},
        {"NumDim", std::to_string(this->NumDim)},
        {"LayoutA", ToString(this->A.layout)},
        {"LayoutB", ToString(this->B.layout)},
        {"LayoutDs",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.layout); }))},
        {"LayoutE", ToString(this->E.layout)},
        {"ADataType", ToString(this->A.element)},
        {"BDataType", ToString(this->B.element)},
        {"AccDataType", ToString(this->acc)},
        {"ComputeDataType", ToString(this->A.element)},
        {"CShuffleDataType", ToString(this->cs_type)},
        {"DsDataType",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.element); }))},
        {"EDataType", ToString(this->E.element)},
        {"AElementwiseOperation", this->a_elem_op},
        {"BElementwiseOperation", this->b_elem_op},
        {"CDEElementwiseOperation", this->cde_elem_op},
        {"Prologue", this->prologue},
        {"Epilogue", this->epilogue},
        {"ConvSpecialization", this->conv_specialization},
        {"GemmSpecialization", this->gemm_specialization},
        {"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)},
        {"BlockSize", std::to_string(this->tile_desc.block_size)},
        {"MPerBlock", std::to_string(this->tile_desc.m_per_block)},
        {"NPerBlock", std::to_string(this->tile_desc.n_per_block)},
        {"KPerBlock", std::to_string(this->tile_desc.k_per_block)},
        {"AK1", std::to_string(this->tile_desc.ak1)},
        {"BK1", std::to_string(this->tile_desc.bk1)},
        {"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)},
        {"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)},
        {"MXdlPerWave", std::to_string(this->tile_desc.m_Xdl_per_wave)},
        {"NXdlPerWave", std::to_string(this->tile_desc.n_Xdl_per_wave)},
        {"ABlockTransferThreadClusterLengths_AK0_M_AK1",
         this->a_block_transfer.thread_cluster_length},
        {"ABlockTransferThreadClusterArrangeOrder",
         this->a_block_transfer.thread_cluster_arrange_order},
        {"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order},
        {"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)},
        {"ABlockTransferSrcScalarPerVector",
         std::to_string(this->a_block_transfer.src_scalar_per_vector)},
        {"ABlockTransferDstScalarPerVector_AK1",
         std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)},
        {"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)},
        {"BBlockTransferThreadClusterLengths_BK0_N_BK1",
         this->b_block_transfer.thread_cluster_length},
        {"BBlockTransferThreadClusterArrangeOrder",
         this->b_block_transfer.thread_cluster_arrange_order},
        {"BBlockTransferSrcAccessOrder", this->b_block_transfer.src_access_order},
        {"BBlockTransferSrcVectorDim", std::to_string(this->b_block_transfer.src_vec_dim)},
        {"BBlockTransferSrcScalarPerVector",
         std::to_string(this->b_block_transfer.src_scalar_per_vector)},
        {"BBlockTransferDstScalarPerVector_BK1",
         std::to_string(this->b_block_transfer.dst_scalar_per_vector_k1)},
        {"BBlockLdsExtraN", std::to_string(this->b_block_transfer.lds_add_extra_dim)},
        {"CShuffleMXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)},
        {"CShuffleNXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)},
        {"CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock",
         this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
        {"CDEBlockTransferScalarPerVector_NPerBlock",
         std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
    };
    return Solution{InterpolateString(CopyDevice_ConvTemplate, values), std::move(values)};
}

} // namespace conv
} // namespace host
} // namespace ck
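The end of the pipeline is string substitution: ToSolution() builds the values map above, and InterpolateString splices it into CopyDevice_ConvTemplate, yielding a self-contained translation unit whose entry point is run_${name}. A sketch of how the tests below consume a Solution (the function names match the test code; the comments are explanatory):

    Solution s = op.ToSolution();
    // full kernel source with every ${key} replaced from the values map
    std::string kernel_src = s.ToTemplateString();
    // "name" encodes the tuning parameters, e.g. "256_128_256_32_8_8_32_32_4_2"
    auto name = s.GetTemplateParameter<std::string>("name");
    // the generated __global__ entry point is therefore "run_" + name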
codegen/src/headers.cpp — +1 −1 (diff collapsed on this page)
codegen/src/types.cpp

@@ -29,12 +29,20 @@ std::string ToString(DataType dt)
     throw std::runtime_error("Incorrect data type");
 }
 
+Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }
+
 std::string ToString(Layout dl)
 {
     switch(dl)
     {
     case Layout::Row: return "ck::tensor_layout::gemm::RowMajor";
     case Layout::Column: return "ck::tensor_layout::gemm::ColumnMajor";
+    case Layout::GKCYX: return "ck::tensor_layout::convolution::GKCYX";
+    case Layout::GKYXC: return "ck::tensor_layout::convolution::GKYXC";
+    case Layout::GNHWK: return "ck::tensor_layout::convolution::GNHWK";
+    case Layout::GNHWC: return "ck::tensor_layout::convolution::GNHWC";
+    case Layout::NHWGC: return "ck::tensor_layout::convolution::NHWGC";
+    case Layout::NHWGK: return "ck::tensor_layout::convolution::NHWGK";
     }
     throw std::runtime_error("Incorrect layout");
 }
codegen/src/utils.cpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "ck/host/utils.hpp"
codegen/test/CMakeLists.txt (+8 −6; resulting file — test sources are now compiled as HIP and gain CK include paths)

list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
foreach(TEST_SRC ${TEST_SRCS})
    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
    set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
    rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC})
    target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
    # target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
    target_include_directories(test_host_${BASE_NAME} PUBLIC include())
    target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
    target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
endforeach()
codegen/test/common.hpp (new file, mode 100644)

#pragma once

#include <algorithm>
#include <cmath>
#include <iterator>
#include <numeric>
#include <random>
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>

std::vector<rtc::src_file> get_headers_for_test()
{
    std::vector<rtc::src_file> result;
    auto hs = ck::host::GetHeaders();
    std::transform(
        hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
            return {p.first, p.second};
        });
    return result;
}

template <typename V>
std::size_t GetSize(V mLens, V mStrides)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < mLens.Size(); ++i)
    {
        if(mLens[i] == 0)
            continue;
        space += (mLens[i] - 1) * mStrides[i];
    }
    return space;
}

template <class T, typename V>
rtc::buffer<T> generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
{
    std::size_t space = GetSize(mLens, mStrides);
    rtc::buffer<T> result(space);
    std::mt19937 gen(seed);
    std::uniform_real_distribution<double> dis(-1.0);
    std::generate(result.begin(), result.end(), [&] { return dis(gen); });
    // std::fill(result.begin(), result.end(), 1);
    return result;
}

template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
    return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
        return fabs(x - y) < atol + rtol * fabs(y);
    });
}

std::string classify(double x)
{
    switch(std::fpclassify(x))
    {
    case FP_INFINITE: return "inf";
    case FP_NAN: return "nan";
    case FP_NORMAL: return "normal";
    case FP_SUBNORMAL: return "subnormal";
    case FP_ZERO: return "zero";
    default: return "unknown";
    }
}

template <class Buffer>
void print_classification(const Buffer& x)
{
    std::unordered_set<std::string> result;
    for(const auto& i : x)
        result.insert(classify(i));
    for(const auto& c : result)
        std::cout << c << ", ";
    std::cout << std::endl;
}

template <class Buffer>
void print_statistics(const Buffer& x)
{
    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
    double num_elements = x.size();
    auto mean =
        std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
    auto stddev = std::sqrt(
        std::accumulate(x.begin(),
                        x.end(),
                        double{0.0},
                        [&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
        num_elements);
    std::cout << "Mean: " << mean << ", ";
    std::cout << "StdDev: " << stddev << "\n";
}

template <class Buffer>
void print_preview(const Buffer& x)
{
    if(x.size() <= 10)
    {
        std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; });
    }
    else
    {
        std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; });
        std::cout << "..., ";
        std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; });
    }
    std::cout << std::endl;
}

template <class T>
struct check_all
{
    rtc::buffer<T> data{};
    bool operator()(const rtc::buffer<T>& x)
    {
        if(data.empty())
        {
            data = x;
            return true;
        }
        return allclose(data, x);
    }
};

template <class Solution>
auto report(const Solution& solution, bool pass)
{
    return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
}
View file @
e6bb1dd7
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include <test.hpp>
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <rtc/hip.hpp>
#include <fstream>
using
half
=
_Float16
;
using
half
=
_Float16
;
// using half = __fp16;
// using half = __fp16;
...
@@ -159,7 +160,10 @@ TEST_CASE(test_problem_kernel)
...
@@ -159,7 +160,10 @@ TEST_CASE(test_problem_kernel)
auto
b
=
to_gpu
(
generate_buffer
<
half
>
(
1024
*
1024
,
1
));
auto
b
=
to_gpu
(
generate_buffer
<
half
>
(
1024
*
1024
,
1
));
auto
c
=
to_gpu
(
generate_buffer
<
half
>
(
1024
*
1024
,
2
));
auto
c
=
to_gpu
(
generate_buffer
<
half
>
(
1024
*
1024
,
2
));
for
(
auto
solution
:
prob
.
GetSolutions
(
"gfx90a"
))
std
::
string
epilogue
=
""
;
std
::
string
prologue
=
""
;
for
(
auto
solution
:
prob
.
GetSolutions
(
"gfx90a"
,
prologue
,
epilogue
))
{
{
auto
src
=
ck
::
host
::
InterpolateString
(
gemm_compile_check
,
auto
src
=
ck
::
host
::
InterpolateString
(
gemm_compile_check
,
{{
"include"
,
prob
.
GetIncludeHeader
()},
{{
"include"
,
prob
.
GetIncludeHeader
()},
...
@@ -178,6 +182,7 @@ TEST_CASE(test_problem_kernel)
...
@@ -178,6 +182,7 @@ TEST_CASE(test_problem_kernel)
auto
grid_size
=
ck
::
host
::
integer_divide_ceil
(
prob
.
M
,
m_per_block
)
*
auto
grid_size
=
ck
::
host
::
integer_divide_ceil
(
prob
.
M
,
m_per_block
)
*
ck
::
host
::
integer_divide_ceil
(
prob
.
N
,
n_per_block
);
ck
::
host
::
integer_divide_ceil
(
prob
.
N
,
n_per_block
);
k
.
launch
(
nullptr
,
grid_size
*
block_size
,
block_size
)(
a
.
data
(),
b
.
data
(),
c
.
data
());
k
.
launch
(
nullptr
,
grid_size
*
block_size
,
block_size
)(
a
.
data
(),
b
.
data
(),
c
.
data
());
CHECK
(
report
(
solution
,
check
(
rtc
::
from_gpu
(
c
))));
CHECK
(
report
(
solution
,
check
(
rtc
::
from_gpu
(
c
))));
}
}
}
}
...
...
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp (new file, mode 100644)

#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include "common.hpp"
#include <fstream>

// Need this for verification
/**struct Epilogue
{
    Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};**/

const std::string conv_compile_check = R"__ck__(
#include <${include}>

${template};

)__ck__";

TEST_CASE(test_problem_kernel)
{
    // set up problem specification
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;

    // user provided fusion operations
    std::string epilogue = R"(
struct Epilogue
{
    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};
)";
    std::string prologue = "";

    // length+stride arrays
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};
    ck::Array<ck::index_t, 5> d_lengths = {};

    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};
    ck::Array<ck::index_t, 5> d_strides = {};

    ck::Array<ck::index_t, 2> conv_filter_strides   = {2, 2};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {1, 1};
    ck::Array<ck::index_t, 2> input_right_pads      = {1, 1};

    // move the data onto the device
    auto in_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));

    // CK Verficiation: Reference Kernel
    /**bool pass = true;
    Tensor<ck::half_t> in_host(in_lengths, in_strides);
    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> out_host(out_lengths, out_strides);

    std::vector<ck::index_t> conv_filter_strides_   = {2, 2};
    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
    std::vector<ck::index_t> input_left_pads_       = {1, 1};
    std::vector<ck::index_t> input_right_pads_      = {1, 1};

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
        2,
        ck::half_t,
        ck::half_t,
        ck::half_t,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        Epilogue>();

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(in_host,
                                              wei_host,
                                              out_host,
                                              conv_filter_strides_,
                                              conv_filter_dilations_,
                                              input_left_pads_,
                                              input_right_pads_,
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/

    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(conv_compile_check,
                                               {{"include", prob.GetIncludeHeader()},
                                                {"template", solution.ToTemplateString()}});

        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);

        // Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];

        // launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);

        // auto res = rtc::from_gpu(out_dev);
        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
        // assert(pass);
        // Simple check: this checks that the output from each instance matches the output from the
        // first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
    }
}

int main(int argc, const char* argv[]) { test::run(argc, argv); }
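Worth spelling out how the fusion hook works in this test: the epilogue string is spliced verbatim at ${Epilogue} in CopyDevice_ConvTemplate, and because update_epilogue switched cde_elem_op to "CDEElementOp", the template's `using CDEElementOp = Epilogue;` alias resolves to the user-provided struct. The generated kernel then constructs ${CDEElementwiseOperation}{1.0f, 1.0f}, i.e. Epilogue{alpha, beta}, applied per element. A concrete sketch of what that evaluates to (illustrative values):

    // With alpha = beta = 1.0f the fused CDE op reduces to e + d in half precision:
    Epilogue cde_op{1.0f, 1.0f};
    ck::half_t e = ck::type_convert<ck::half_t>(0.5f);  // gemm accumulator value
    ck::half_t d = ck::type_convert<ck::half_t>(0.25f); // residual/bias value
    cde_op(e, d); // e = type_convert<half_t>(1.0f * 0.5f + 1.0f * 0.25f) = 0.75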
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
0 → 100644
View file @
e6bb1dd7
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for validation
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
const
std
::
string
conv_compile_check
=
R"__ck__(
#include <${include}>
${template};
)__ck__"
;
TEST_CASE
(
test_problem_kernel
)
{
// set up problem specification
ck
::
host
::
conv
::
Problem_Conv_Fwd
prob
;
prob
.
NumDim
=
2
;
prob
.
G
=
32
;
prob
.
N
=
256
;
prob
.
C
=
32
;
prob
.
K
=
64
;
prob
.
Y
=
3
;
prob
.
X
=
3
;
prob
.
Hi
=
28
;
prob
.
Wi
=
28
;
prob
.
Ho
=
28
;
prob
.
Wo
=
28
;
check_all
<
ck
::
half_t
>
check
;
// user provided fusion operations
std
::
string
epilogue
=
R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)"
;
std
::
string
prologue
=
""
;
// length+stride arrays
ck
::
Array
<
ck
::
index_t
,
5
>
in_lengths
{
static_cast
<
int
>
(
prob
.
G
),
static_cast
<
int
>
(
prob
.
N
),
static_cast
<
int
>
(
prob
.
C
),
static_cast
<
int
>
(
prob
.
Hi
),
static_cast
<
int
>
(
prob
.
Wi
)};
ck
::
Array
<
ck
::
index_t
,
5
>
out_lengths
{
static_cast
<
int
>
(
prob
.
G
),
static_cast
<
int
>
(
prob
.
N
),
static_cast
<
int
>
(
prob
.
K
),
static_cast
<
int
>
(
prob
.
Ho
),
static_cast
<
int
>
(
prob
.
Wo
)};
ck
::
Array
<
ck
::
index_t
,
5
>
wei_lengths
{
static_cast
<
int
>
(
prob
.
G
),
static_cast
<
int
>
(
prob
.
K
),
static_cast
<
int
>
(
prob
.
C
),
static_cast
<
int
>
(
prob
.
Y
),
static_cast
<
int
>
(
prob
.
X
)};
ck
::
Array
<
ck
::
index_t
,
5
>
d_lengths
=
{};
ck
::
Array
<
ck
::
index_t
,
5
>
in_strides
{
static_cast
<
int
>
(
prob
.
C
),
static_cast
<
int
>
(
prob
.
Hi
*
prob
.
Wi
*
prob
.
G
*
prob
.
C
),
1
,
static_cast
<
int
>
(
prob
.
Wi
*
prob
.
G
*
prob
.
C
),
static_cast
<
int
>
(
prob
.
G
*
prob
.
C
)};
ck
::
Array
<
ck
::
index_t
,
5
>
out_strides
{
static_cast
<
int
>
(
prob
.
K
),
static_cast
<
int
>
(
prob
.
Ho
*
prob
.
Wo
*
prob
.
G
*
prob
.
K
),
1
,
static_cast
<
int
>
(
prob
.
Wo
*
prob
.
G
*
prob
.
K
),
static_cast
<
int
>
(
prob
.
G
*
prob
.
K
)};
ck
::
Array
<
ck
::
index_t
,
5
>
wei_strides
{
static_cast
<
int
>
(
prob
.
K
*
prob
.
Y
*
prob
.
X
*
prob
.
C
),
static_cast
<
int
>
(
prob
.
Y
*
prob
.
X
*
prob
.
C
),
1
,
static_cast
<
int
>
(
prob
.
X
*
prob
.
C
),
static_cast
<
int
>
(
prob
.
C
)};
ck
::
Array
<
ck
::
index_t
,
5
>
d_strides
=
{};
ck
::
Array
<
ck
::
index_t
,
2
>
conv_filter_strides
=
{
1
,
1
};
ck
::
Array
<
ck
::
index_t
,
2
>
conv_filter_dilations
=
{
1
,
1
};
ck
::
Array
<
ck
::
index_t
,
2
>
input_left_pads
=
{
0
,
0
};
ck
::
Array
<
ck
::
index_t
,
2
>
input_right_pads
=
{
0
,
0
};
// move the data onto the device
auto
in_dev
=
to_gpu
(
generate_buffer
<
ck
::
half_t
,
ck
::
Array
<
ck
::
index_t
,
5
>>
(
in_lengths
,
in_strides
,
0
));
auto
wei_dev
=
to_gpu
(
generate_buffer
<
ck
::
half_t
,
ck
::
Array
<
ck
::
index_t
,
5
>>
(
wei_lengths
,
wei_strides
,
1
));
auto
out_dev
=
to_gpu
(
generate_buffer
<
ck
::
half_t
,
ck
::
Array
<
ck
::
index_t
,
5
>>
(
out_lengths
,
out_strides
,
2
));
// CK Verficiation: Reference Kernel
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {0, 0};
std::vector<ck::index_t> input_right_pads_ = {0, 0};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);
// Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];
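        // get_launch_params() presumably derives a per-sample grid size from the
        // instance's tile shape and the output descriptor; multiplying by in_lengths[1]
        // (the batch size N) scales the launch across the whole batch.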
// launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
0 → 100644
View file @
e6bb1dd7
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "common.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for verification
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
const std::string conv_compile_check = R"__ck__(
#include <${include}>
${template};
)__ck__";
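// The ${include} and ${template} placeholders are filled per solution via
// ck::host::InterpolateString below: ${include} receives prob.GetIncludeHeader() and
// ${template} the instance's fully-substituted template string from
// solution.ToTemplateString().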
TEST_CASE(test_problem_kernel)
{
// set up problem specification
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;
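    // check_all<T> (presumably defined in common.hpp) stores the first solution's
    // output and compares every later solution's output against it, so the CHECK at the
    // bottom of the loop validates instances relative to the first one rather than to a
    // CPU reference.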
// user provided fusion operations
    std::string epilogue = R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)"
;
std
::
string
prologue
=
""
;
// length+stride arrays
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};
    ck::Array<ck::index_t, 5> d_lengths = {};
    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};
    ck::Array<ck::index_t, 5> d_strides             = {};
    ck::Array<ck::index_t, 2> conv_filter_strides   = {2, 2};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {0, 0};
    ck::Array<ck::index_t, 2> input_right_pads      = {0, 0};
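    // Unlike the stride-{1, 1} variant above, this test uses a filter stride of 2 in
    // both spatial dimensions. Note that for a 28x28 input, a 3x3 filter, zero padding,
    // and stride 2, a standard convolution would produce a 13x13 output
    // (floor((28 - 3) / 2) + 1 = 13), while the problem spec above keeps Ho = Wo = 28;
    // the descriptors are passed to the generated kernel as-is.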
// move the data onto the device
    auto in_dev = to_gpu(
        generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev = to_gpu(
        generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev = to_gpu(
        generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
    // CK Verification: Reference Kernel
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {2, 2};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {0, 0};
std::vector<ck::index_t> input_right_pads_ = {0, 0};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);
// Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];
// launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }