Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6bc9ee05
Unverified
Commit
6bc9ee05
authored
Sep 13, 2023
by
Chao Liu
Committed by
GitHub
Sep 13, 2023
Browse files
Remove program server (#10)
* removing program server * specify launch bound per kernel instance
parent
f3baea0d
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
245 additions
and
648 deletions
+245
-648
example/91_tile_program/CMakeLists.txt
example/91_tile_program/CMakeLists.txt
+0
-1
example/91_tile_program/gemm.cpp
example/91_tile_program/gemm.cpp
+18
-18
example/91_tile_program/gemm.hpp
example/91_tile_program/gemm.hpp
+15
-21
example/91_tile_program/gemm_gemm.cpp
example/91_tile_program/gemm_gemm.cpp
+29
-28
example/91_tile_program/gemm_gemm.hpp
example/91_tile_program/gemm_gemm.hpp
+23
-25
example/91_tile_program/gemm_softmax_gemm.cpp
example/91_tile_program/gemm_softmax_gemm.cpp
+31
-34
example/91_tile_program/gemm_softmax_gemm.hpp
example/91_tile_program/gemm_softmax_gemm.hpp
+25
-30
example/91_tile_program/hello_world.cpp
example/91_tile_program/hello_world.cpp
+0
-62
example/91_tile_program/im2col.cpp
example/91_tile_program/im2col.cpp
+28
-33
example/91_tile_program/reduce.cpp
example/91_tile_program/reduce.cpp
+12
-12
example/91_tile_program/reduce.hpp
example/91_tile_program/reduce.hpp
+6
-7
example/91_tile_program/softmax.cpp
example/91_tile_program/softmax.cpp
+10
-13
example/91_tile_program/softmax.hpp
example/91_tile_program/softmax.hpp
+6
-7
example/91_tile_program/tile_program.hpp
example/91_tile_program/tile_program.hpp
+0
-189
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+26
-0
include/ck/tile_program/block_tile/block_gemm_areg_bsmem_creg_v1.hpp
...tile_program/block_tile/block_gemm_areg_bsmem_creg_v1.hpp
+0
-57
include/ck/tile_program/block_tile/block_gemm_asmem_bsmem_creg_v1.hpp
...ile_program/block_tile/block_gemm_asmem_bsmem_creg_v1.hpp
+0
-49
include/ck/tile_program/block_tile/block_reduce.hpp
include/ck/tile_program/block_tile/block_reduce.hpp
+0
-46
include/ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
...tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
+7
-7
include/ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
...tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
+9
-9
No files found.
example/91_tile_program/CMakeLists.txt
View file @
6bc9ee05
add_example_executable
(
example_hello_world hello_world.cpp
)
add_example_executable
(
example_im2col im2col.cpp
)
add_example_executable
(
example_im2col im2col.cpp
)
add_example_executable
(
example_gemm gemm.cpp
)
add_example_executable
(
example_gemm gemm.cpp
)
add_example_executable
(
example_gemm_gemm gemm_gemm.cpp
)
add_example_executable
(
example_gemm_gemm gemm_gemm.cpp
)
...
...
example/91_tile_program/gemm.cpp
View file @
6bc9ee05
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -115,22 +114,23 @@ int main(int argc, char* argv[])
...
@@ -115,22 +114,23 @@ int main(int argc, char* argv[])
kGemmNPerBlock
,
kGemmNPerBlock
,
kGemmKPerBlock
>
{};
kGemmKPerBlock
>
{};
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
launch_kernel
<
kBlockSize
,
2
>
(
StreamConfig
{
nullptr
,
true
},
gemm_kernel
,
gemm_kernel
,
kGridSize
,
kGridSize
,
kBlockSize
,
kBlockSize
,
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
0
,
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
M
,
static_cast
<
CDataType
*>
(
c_buf
.
GetDeviceBuffer
()),
N
,
M
,
K
,
N
,
K
,
K
,
K
,
K
,
N
,
K
,
AElementFunction
{},
N
,
BElementFunction
{},
AElementFunction
{},
CElementFunction
{});
BElementFunction
{},
CElementFunction
{});
c_buf
.
FromDevice
(
c_host_dev
.
mData
.
data
());
c_buf
.
FromDevice
(
c_host_dev
.
mData
.
data
());
...
...
example/91_tile_program/gemm.hpp
View file @
6bc9ee05
...
@@ -7,8 +7,8 @@
...
@@ -7,8 +7,8 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
...
@@ -59,19 +59,18 @@ struct Gemm
...
@@ -59,19 +59,18 @@ struct Gemm
using
GridGemm
=
ck
::
tile_program
::
grid
::
GridGemm
<
Problem
,
Policy
>
;
using
GridGemm
=
ck
::
tile_program
::
grid
::
GridGemm
<
Problem
,
Policy
>
;
__host__
__device__
void
operator
()(
ProgramServer
&
ps
,
__device__
void
operator
()(
const
ADataType
*
p_a
,
const
ADataType
*
p_a
,
const
BDataType
*
p_b
,
const
BDataType
*
p_b
,
CDataType
*
p_c
,
CDataType
*
p_c
,
ck
::
index_t
M
,
ck
::
index_t
M
,
ck
::
index_t
N
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
K
,
ck
::
index_t
Lda
,
ck
::
index_t
Lda
,
ck
::
index_t
Ldb
,
ck
::
index_t
Ldb
,
ck
::
index_t
Ldc
,
ck
::
index_t
Ldc
,
const
AElementFunction
&
a_element_func
,
const
AElementFunction
&
a_element_func
,
const
BElementFunction
&
b_element_func
,
const
BElementFunction
&
b_element_func
,
const
CElementFunction
&
c_element_func
)
const
const
CElementFunction
&
c_element_func
)
const
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -87,12 +86,7 @@ struct Gemm
...
@@ -87,12 +86,7 @@ struct Gemm
auto
c_dram_grid
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
auto
c_dram_grid
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
p_c
,
make_tuple
(
M
,
N
),
make_tuple
(
Ldc
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
p_c
,
make_tuple
(
M
,
N
),
make_tuple
(
Ldc
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
GridGemm
{}(
ps
,
GridGemm
{}(
a_dram_grid
,
a_dram_grid
,
b_dram_grid
,
c_dram_grid
,
a_element_func
,
b_element_func
,
c_element_func
);
b_dram_grid
,
c_dram_grid
,
a_element_func
,
b_element_func
,
c_element_func
);
}
}
};
};
example/91_tile_program/gemm_gemm.cpp
View file @
6bc9ee05
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -90,33 +89,35 @@ int main(int argc, char* argv[])
...
@@ -90,33 +89,35 @@ int main(int argc, char* argv[])
std
::
cout
<<
"grid size "
<<
kGridSize
<<
std
::
endl
;
std
::
cout
<<
"grid size "
<<
kGridSize
<<
std
::
endl
;
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
GemmGemm
<
A0DataType
,
launch_kernel
<
kBlockSize
,
2
>
(
StreamConfig
{
nullptr
,
true
},
B0DataType
,
GemmGemm
<
A0DataType
,
Acc0DataType
,
B0DataType
,
C0DataType
,
Acc0DataType
,
B1DataType
,
C0DataType
,
Acc1DataType
,
B1DataType
,
C1DataType
,
Acc1DataType
,
C1DataType
,
kBlockSize
,
kM0PerBlock
,
kN0PerBlock
,
kK0PerBlock
,
kN1PerBlock
>
{},
kGridSize
,
kBlockSize
,
kBlockSize
,
kM0PerBlock
,
0
,
kN0PerBlock
,
static_cast
<
A0DataType
*>
(
a0_buf
.
GetDeviceBuffer
()),
kK0PerBlock
,
static_cast
<
B0DataType
*>
(
b0_buf
.
GetDeviceBuffer
()),
kN1PerBlock
>
{},
static_cast
<
B1DataType
*>
(
b1_buf
.
GetDeviceBuffer
()),
kGridSize
,
static_cast
<
C1DataType
*>
(
c1_buf
.
GetDeviceBuffer
()),
kBlockSize
,
M0
,
static_cast
<
A0DataType
*>
(
a0_buf
.
GetDeviceBuffer
()),
N0
,
static_cast
<
B0DataType
*>
(
b0_buf
.
GetDeviceBuffer
()),
K0
,
static_cast
<
B1DataType
*>
(
b1_buf
.
GetDeviceBuffer
()),
N1
,
static_cast
<
C1DataType
*>
(
c1_buf
.
GetDeviceBuffer
()),
K0
,
// Lda0
M0
,
K0
,
// Ldb0
N0
,
N0
,
// Ldb1
K0
,
N1
);
// Ldc1
N1
,
K0
,
// Lda0
K0
,
// Ldb0
N0
,
// Ldb1
N1
);
// Ldc1
c1_buf
.
FromDevice
(
c1_host_dev
.
mData
.
data
());
c1_buf
.
FromDevice
(
c1_host_dev
.
mData
.
data
());
...
...
example/91_tile_program/gemm_gemm.hpp
View file @
6bc9ee05
...
@@ -8,7 +8,6 @@
...
@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
...
@@ -54,7 +53,7 @@ struct GemmGemm
...
@@ -54,7 +53,7 @@ struct GemmGemm
#if 0
#if 0
// 2d
// 2d
__host__
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
{
using namespace ck;
using namespace ck;
...
@@ -68,7 +67,7 @@ struct GemmGemm
...
@@ -68,7 +67,7 @@ struct GemmGemm
}
}
#else
#else
// fake XOR
// fake XOR
__host__
__device__
static
constexpr
auto
MakeB1LdsBlockDescriptor
()
__device__
static
constexpr
auto
MakeB1LdsBlockDescriptor
()
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -100,7 +99,7 @@ struct GemmGemm
...
@@ -100,7 +99,7 @@ struct GemmGemm
}
}
#endif
#endif
__host__
__device__
static
constexpr
auto
MakeB1DramTileDistribution
()
__device__
static
constexpr
auto
MakeB1DramTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -125,7 +124,7 @@ struct GemmGemm
...
@@ -125,7 +124,7 @@ struct GemmGemm
Sequence
<
0
,
1
>>
{});
Sequence
<
0
,
1
>>
{});
}
}
__host__
__device__
static
constexpr
ck
::
index_t
GetStaticLdsSize
()
__device__
static
constexpr
ck
::
index_t
GetStaticLdsSize
()
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -134,19 +133,18 @@ struct GemmGemm
...
@@ -134,19 +133,18 @@ struct GemmGemm
sizeof
(
B1DataType
)));
sizeof
(
B1DataType
)));
}
}
__host__
__device__
void
operator
()(
ProgramServer
&
ps
,
__device__
void
operator
()(
const
A0DataType
*
p_a0
,
const
A0DataType
*
p_a0
,
const
B0DataType
*
p_b0
,
const
B0DataType
*
p_b0
,
const
B1DataType
*
p_b1
,
const
B1DataType
*
p_b1
,
C1DataType
*
p_c1
,
C1DataType
*
p_c1
,
ck
::
index_t
M0
,
ck
::
index_t
M0
,
ck
::
index_t
N0
,
ck
::
index_t
N0
,
ck
::
index_t
K0
,
ck
::
index_t
K0
,
ck
::
index_t
N1
,
ck
::
index_t
N1
,
ck
::
index_t
Lda0
,
ck
::
index_t
Lda0
,
ck
::
index_t
Ldb0
,
ck
::
index_t
Ldb0
,
ck
::
index_t
Ldb1
,
ck
::
index_t
Ldb1
,
ck
::
index_t
Ldc1
)
ck
::
index_t
Ldc1
)
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -163,17 +161,17 @@ struct GemmGemm
...
@@ -163,17 +161,17 @@ struct GemmGemm
p_b1
,
make_tuple
(
N1
,
N0
),
make_tuple
(
Ldb1
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
p_b1
,
make_tuple
(
N1
,
N0
),
make_tuple
(
Ldb1
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
// divide problem
// divide problem
const
auto
id_block
=
ps
.
get_block_id
();
const
auto
id_block
=
get_block_id
();
const
auto
num_tile_m0
=
M0
/
kM0PerBlock
;
const
auto
num_tile_m0
=
M0
/
kM0PerBlock
;
const
auto
num_tile_n1
=
N1
/
kN1PerBlock
;
const
auto
num_tile_n1
=
N1
/
kN1PerBlock
;
const
auto
block2tile
=
ps
(
make_cluster_descriptor
(
make_tuple
(
num_tile_m0
,
num_tile_n1
))
)
;
const
auto
block2tile
=
make_cluster_descriptor
(
make_tuple
(
num_tile_m0
,
num_tile_n1
));
const
auto
id_tile
=
block2tile
.
CalculateBottomIndex
(
make_tuple
(
id_block
));
const
auto
id_tile
=
block2tile
.
CalculateBottomIndex
(
make_tuple
(
id_block
));
const
auto
iM0
=
ps
.
read
_
first
_
lane
(
id_tile
.
At
<
0
>
()
*
kM0PerBlock
);
const
auto
iM0
=
__builtin_amdgcn_
readfirstlane
(
id_tile
.
At
<
0
>
()
*
kM0PerBlock
);
const
auto
iN1
=
ps
.
read
_
first
_
lane
(
id_tile
.
At
<
1
>
()
*
kN1PerBlock
);
const
auto
iN1
=
__builtin_amdgcn_
readfirstlane
(
id_tile
.
At
<
1
>
()
*
kN1PerBlock
);
__shared__
char
p_smem_char
[
GetStaticLdsSize
()];
__shared__
char
p_smem_char
[
GetStaticLdsSize
()];
...
@@ -233,18 +231,18 @@ struct GemmGemm
...
@@ -233,18 +231,18 @@ struct GemmGemm
const
auto
b1_block_tile
=
load_tile
(
b1_dram_block_window
);
const
auto
b1_block_tile
=
load_tile
(
b1_dram_block_window
);
// wait for block gemm0 pipeline to finish
// wait for block gemm0 pipeline to finish
ps
.
block_sync_lds
();
block_sync_lds
();
store_tile
(
b1_lds_block_window
,
b1_block_tile
);
store_tile
(
b1_lds_block_window
,
b1_block_tile
);
// wait for store_tile to finish
// wait for store_tile to finish
ps
.
block_sync_lds
();
block_sync_lds
();
// acc1 += c0 * b1
// acc1 += c0 * b1
block_gemm1
(
acc1_block_tile
,
c0_block_tile
,
b1_lds_block_window
);
block_gemm1
(
acc1_block_tile
,
c0_block_tile
,
b1_lds_block_window
);
// wait for block gemm1 to finish
// wait for block gemm1 to finish
ps
.
block_sync_lds
();
block_sync_lds
();
}
}
// move tile windows
// move tile windows
...
...
example/91_tile_program/gemm_softmax_gemm.cpp
View file @
6bc9ee05
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -73,14 +72,10 @@ int main(int argc, char* argv[])
...
@@ -73,14 +72,10 @@ int main(int argc, char* argv[])
ck
::
utils
::
FillUniformDistributionIntegerValue
<
A0DataType
>
{
-
3.
f
,
3.
f
}(
a0_host
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
A0DataType
>
{
-
3.
f
,
3.
f
}(
a0_host
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
B0DataType
>
{
-
3.
f
,
3.
f
}(
b0_host
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
B0DataType
>
{
-
3.
f
,
3.
f
}(
b0_host
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
B1DataType
>
{
-
3.
f
,
3.
f
}(
b1_host
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
B1DataType
>
{
-
3.
f
,
3.
f
}(
b1_host
);
#el
if 0
#el
se
ck
::
utils
::
FillUniformDistribution
<
A0DataType
>
{
-
3.
f
,
3.
f
}(
a0_host
);
ck
::
utils
::
FillUniformDistribution
<
A0DataType
>
{
-
3.
f
,
3.
f
}(
a0_host
);
ck
::
utils
::
FillUniformDistribution
<
B0DataType
>
{
-
3.
f
,
3.
f
}(
b0_host
);
ck
::
utils
::
FillUniformDistribution
<
B0DataType
>
{
-
3.
f
,
3.
f
}(
b0_host
);
ck
::
utils
::
FillUniformDistribution
<
B1DataType
>
{
-
3.
f
,
3.
f
}(
b1_host
);
ck
::
utils
::
FillUniformDistribution
<
B1DataType
>
{
-
3.
f
,
3.
f
}(
b1_host
);
#else
ck
::
utils
::
FillConstant
<
A0DataType
>
{
1.0
f
}(
a0_host
);
ck
::
utils
::
FillConstant
<
A0DataType
>
{
1.0
f
}(
b0_host
);
ck
::
utils
::
FillConstant
<
A0DataType
>
{
1.0
f
}(
b1_host
);
#endif
#endif
// reference
// reference
...
@@ -107,33 +102,35 @@ int main(int argc, char* argv[])
...
@@ -107,33 +102,35 @@ int main(int argc, char* argv[])
std
::
cout
<<
"grid size "
<<
kGridSize
<<
std
::
endl
;
std
::
cout
<<
"grid size "
<<
kGridSize
<<
std
::
endl
;
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
GemmSoftmaxGemm
<
A0DataType
,
launch_kernel
<
kBlockSize
,
2
>
(
StreamConfig
{
nullptr
,
true
},
B0DataType
,
GemmSoftmaxGemm
<
A0DataType
,
Acc0DataType
,
B0DataType
,
C0DataType
,
Acc0DataType
,
B1DataType
,
C0DataType
,
Acc1DataType
,
B1DataType
,
C1DataType
,
Acc1DataType
,
kBlockSize
,
C1DataType
,
kM0PerBlock
,
kBlockSize
,
kN0PerBlock
,
kM0PerBlock
,
kK0PerBlock
,
kN0PerBlock
,
kN1PerBlock
>
{},
kK0PerBlock
,
kGridSize
,
kN1PerBlock
>
{},
kBlockSize
,
kGridSize
,
static_cast
<
A0DataType
*>
(
a0_buf
.
GetDeviceBuffer
()),
kBlockSize
,
static_cast
<
B0DataType
*>
(
b0_buf
.
GetDeviceBuffer
()),
0
,
static_cast
<
B1DataType
*>
(
b1_buf
.
GetDeviceBuffer
()),
static_cast
<
A0DataType
*>
(
a0_buf
.
GetDeviceBuffer
()),
static_cast
<
C1DataType
*>
(
c1_buf
.
GetDeviceBuffer
()),
static_cast
<
B0DataType
*>
(
b0_buf
.
GetDeviceBuffer
()),
M0
,
static_cast
<
B1DataType
*>
(
b1_buf
.
GetDeviceBuffer
()),
N0
,
static_cast
<
C1DataType
*>
(
c1_buf
.
GetDeviceBuffer
()),
K0
,
M0
,
N1
,
N0
,
K0
,
// Lda0
K0
,
K0
,
// Ldb0
N1
,
N0
,
// Ldb1
K0
,
// Lda0
N1
);
// Ldc1
K0
,
// Ldb0
N0
,
// Ldb1
N1
);
// Ldc1
c1_buf
.
FromDevice
(
c1_host_dev
.
mData
.
data
());
c1_buf
.
FromDevice
(
c1_host_dev
.
mData
.
data
());
...
...
example/91_tile_program/gemm_softmax_gemm.hpp
View file @
6bc9ee05
...
@@ -8,7 +8,6 @@
...
@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
...
@@ -19,8 +18,7 @@
...
@@ -19,8 +18,7 @@
#include "ck/tile_program/block_tile/block_reduce.hpp"
#include "ck/tile_program/block_tile/block_reduce.hpp"
// C0 = A0 * B0
// C0 = A0 * B0
// D0 = softmax(C0)
// C1 = softmax(C0) * B1
// C1 = D0 * B1
template
<
typename
A0DataType
,
template
<
typename
A0DataType
,
typename
B0DataType
,
typename
B0DataType
,
typename
Acc0DataType
,
typename
Acc0DataType
,
...
@@ -57,7 +55,7 @@ struct GemmSoftmaxGemm
...
@@ -57,7 +55,7 @@ struct GemmSoftmaxGemm
#if 0
#if 0
// 2d
// 2d
__host__
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
{
using namespace ck;
using namespace ck;
...
@@ -71,7 +69,7 @@ struct GemmSoftmaxGemm
...
@@ -71,7 +69,7 @@ struct GemmSoftmaxGemm
}
}
#else
#else
// fake XOR
// fake XOR
__host__
__device__
static
constexpr
auto
MakeB1LdsBlockDescriptor
()
__device__
static
constexpr
auto
MakeB1LdsBlockDescriptor
()
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -103,7 +101,7 @@ struct GemmSoftmaxGemm
...
@@ -103,7 +101,7 @@ struct GemmSoftmaxGemm
}
}
#endif
#endif
__host__
__device__
static
constexpr
auto
MakeB1DramTileDistribution
()
__device__
static
constexpr
auto
MakeB1DramTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -128,7 +126,7 @@ struct GemmSoftmaxGemm
...
@@ -128,7 +126,7 @@ struct GemmSoftmaxGemm
Sequence
<
0
,
1
>>
{});
Sequence
<
0
,
1
>>
{});
}
}
__host__
__device__
static
constexpr
ck
::
index_t
GetStaticLdsSize
()
__device__
static
constexpr
ck
::
index_t
GetStaticLdsSize
()
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -137,19 +135,18 @@ struct GemmSoftmaxGemm
...
@@ -137,19 +135,18 @@ struct GemmSoftmaxGemm
sizeof
(
B1DataType
)));
sizeof
(
B1DataType
)));
}
}
__host__
__device__
void
operator
()(
ProgramServer
&
ps
,
__device__
void
operator
()(
const
A0DataType
*
p_a0
,
const
A0DataType
*
p_a0
,
const
B0DataType
*
p_b0
,
const
B0DataType
*
p_b0
,
const
B1DataType
*
p_b1
,
const
B1DataType
*
p_b1
,
C1DataType
*
p_c1
,
C1DataType
*
p_c1
,
ck
::
index_t
M0
,
ck
::
index_t
M0
,
ck
::
index_t
N0
,
ck
::
index_t
N0
,
ck
::
index_t
K0
,
ck
::
index_t
K0
,
ck
::
index_t
N1
,
ck
::
index_t
N1
,
ck
::
index_t
Lda0
,
ck
::
index_t
Lda0
,
ck
::
index_t
Ldb0
,
ck
::
index_t
Ldb0
,
ck
::
index_t
Ldb1
,
ck
::
index_t
Ldb1
,
ck
::
index_t
Ldc1
)
ck
::
index_t
Ldc1
)
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -169,17 +166,15 @@ struct GemmSoftmaxGemm
...
@@ -169,17 +166,15 @@ struct GemmSoftmaxGemm
p_b1
,
make_tuple
(
N1
,
N0
),
make_tuple
(
Ldb1
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
p_b1
,
make_tuple
(
N1
,
N0
),
make_tuple
(
Ldb1
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
// divide problem
// divide problem
const
auto
id_block
=
ps
.
get_block_id
();
const
auto
num_tile_m0
=
M0
/
kM0PerBlock
;
const
auto
num_tile_n1
=
N1
/
kN1PerBlock
;
const
auto
num_tile_n1
=
N1
/
kN1PerBlock
;
const
auto
block
2tile
=
ps
(
make_cluster_descriptor
(
make_tuple
(
num_tile_m0
,
num_tile_n1
))
);
const
auto
id_
block
=
get_block_id
(
);
const
auto
id_tile
=
block2tile
.
CalculateBottomIndex
(
make_tuple
(
id_block
));
const
auto
id_tile_m
=
id_block
/
num_tile_n1
;
const
auto
id_tile_n
=
id_block
-
id_tile_m
*
num_tile_n1
;
const
auto
iM0
=
ps
.
read
_
first
_
lane
(
id_tile
.
At
<
0
>
()
*
kM0PerBlock
);
const
auto
iM0
=
__builtin_amdgcn_
readfirstlane
(
id_tile
_m
*
kM0PerBlock
);
const
auto
iN1
=
ps
.
read
_
first
_
lane
(
id_tile
.
At
<
1
>
()
*
kN1PerBlock
);
const
auto
iN1
=
__builtin_amdgcn_
readfirstlane
(
id_tile
_n
*
kN1PerBlock
);
__shared__
char
p_smem_char
[
GetStaticLdsSize
()];
__shared__
char
p_smem_char
[
GetStaticLdsSize
()];
...
@@ -333,18 +328,18 @@ struct GemmSoftmaxGemm
...
@@ -333,18 +328,18 @@ struct GemmSoftmaxGemm
const
auto
b1_block_tile
=
load_tile
(
b1_dram_block_window
);
const
auto
b1_block_tile
=
load_tile
(
b1_dram_block_window
);
// wait for block gemm0 pipeline to finish
// wait for block gemm0 pipeline to finish
ps
.
block_sync_lds
();
block_sync_lds
();
store_tile
(
b1_lds_block_window
,
b1_block_tile
);
store_tile
(
b1_lds_block_window
,
b1_block_tile
);
// wait for store_tile to finish
// wait for store_tile to finish
ps
.
block_sync_lds
();
block_sync_lds
();
// acc1 += c0 * b1
// acc1 += c0 * b1
block_gemm1
(
acc1_block_tile
,
c0_block_tile
,
b1_lds_block_window
);
block_gemm1
(
acc1_block_tile
,
c0_block_tile
,
b1_lds_block_window
);
// wait for block gemm1 to finish
// wait for block gemm1 to finish
ps
.
block_sync_lds
();
block_sync_lds
();
}
}
// move tile windows
// move tile windows
...
...
example/91_tile_program/hello_world.cpp
deleted
100644 → 0
View file @
f3baea0d
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "tile_program.hpp"
#include "ck/library/utility/device_memory.hpp"
// ProgramServer contains a "meta data buffer".
// The host evaluates the expression inside ps() and pushes the result into the meta data buffer.
// ProgramServer sends the meta data buffer to the GPU as a kernel argument.
// The device reads (does not evaluate) the value of the expression inside ps() from the meta data buffer.
// Smallest tile-program example: a functor that routes two integer
// expressions (x + y and x - y) through a ProgramServer and stores the
// values it gets back into res[0] and res[1].
struct HelloWorld
{
    // ps   - program server; each ps(expr) call yields the value that is stored
    // x, y - integer operands
    // res  - output buffer; elements 0 and 1 are written
    __host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
    {
#if 1
        // The two ps() calls are issued in this order (sum, then difference)
        // before anything is written to res.
        auto sum  = ps(x + y);
        auto diff = ps(x - y);

        res[0] = sum;
        res[1] = diff;
#elif 1
        // Alternative body, not compiled while the branch above is `#if 1`:
        // exercises thread id and warp shuffle helpers instead of x and y.
        (void)x;
        (void)y;

        auto tid       = ps.get_thread_id();
        auto from_up   = ps.warp_shuffle_up(tid, 1);
        auto from_down = ps.warp_shuffle_down(tid, 1);

        printf("tid %d, r0 %d, r1 %d, r2 %d\n", ps.get_thread_id(), tid, from_up, from_down);

        res[0] = tid;
        res[1] = from_down;
#endif
    }
};
int
main
()
{
int
x
=
100
;
int
y
=
101
;
DeviceMem
res_dev_buf
(
2
*
sizeof
(
int
));
launch
(
ProgramServer
{},
HelloWorld
{},
1
,
64
,
x
,
y
,
static_cast
<
int
*>
(
res_dev_buf
.
GetDeviceBuffer
()));
int
res_host
[
2
];
res_dev_buf
.
FromDevice
(
&
res_host
);
printf
(
"x+y=: %d
\n
"
,
res_host
[
0
]);
printf
(
"x-y=: %d
\n
"
,
res_host
[
1
]);
return
0
;
}
example/91_tile_program/im2col.cpp
View file @
6bc9ee05
...
@@ -9,10 +9,9 @@
...
@@ -9,10 +9,9 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
...
@@ -105,10 +104,8 @@ struct Im2Col
...
@@ -105,10 +104,8 @@ struct Im2Col
Sequence
<
0
,
1
>>
{});
Sequence
<
0
,
1
>>
{});
}
}
template
<
typename
Server
>
__host__
__device__
void
__host__
__device__
void
operator
()(
Server
&
ps
,
operator
()(
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
a_n_wis_c_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
a_n_wis_c_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
/* a_n_wis_c_strides */
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
/* a_n_wis_c_strides */
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
b_k_xs_c_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
b_k_xs_c_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
/* b_k_xs_c_strides */
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>&
/* b_k_xs_c_strides */
,
...
@@ -118,10 +115,8 @@ struct Im2Col
...
@@ -118,10 +115,8 @@ struct Im2Col
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_right_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_right_pads
,
//
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_strides
,
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_strides
,
//
const
T
*
p_a_img
,
const
T
*
p_a_img
,
T
*
p_a_mtx
)
T
*
p_a_mtx
)
{
{
...
@@ -176,8 +171,8 @@ struct Im2Col
...
@@ -176,8 +171,8 @@ struct Im2Col
const
auto
src_gemmm_gemmk
=
const
auto
src_gemmm_gemmk
=
transform_tensor_view
(
a_n_y_ho_x_wo_c
,
transform_tensor_view
(
a_n_y_ho_x_wo_c
,
make_tuple
(
ps
(
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
))
)
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
)),
ps
(
make_merge_transform
(
make_tuple
(
Y
,
X
,
C
)))
)
,
make_merge_transform
(
make_tuple
(
Y
,
X
,
C
))),
make_tuple
(
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
1
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
1
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -191,15 +186,15 @@ struct Im2Col
...
@@ -191,15 +186,15 @@ struct Im2Col
const
auto
numGemmM
=
a_gemmm_gemmk_lengths
[
0
];
const
auto
numGemmM
=
a_gemmm_gemmk_lengths
[
0
];
const
auto
numGemmK
=
a_gemmm_gemmk_lengths
[
1
];
const
auto
numGemmK
=
a_gemmm_gemmk_lengths
[
1
];
const
auto
id_block
=
ps
.
get_block_id
();
const
auto
id_block
=
get_block_id
();
const
auto
num_tile_m
=
ps
.
read
_
first
_
lane
(
numGemmM
/
kMPerBlock
);
const
auto
num_tile_m
=
__builtin_amdgcn_
readfirstlane
(
numGemmM
/
kMPerBlock
);
const
auto
block2tile
=
ps
(
make_cluster_descriptor
(
make_tuple
(
num_tile_m
))
)
;
const
auto
block2tile
=
make_cluster_descriptor
(
make_tuple
(
num_tile_m
));
const
auto
i_gemmm_gemmk
=
block2tile
.
CalculateBottomIndex
(
make_multi_index
(
id_block
));
const
auto
i_gemmm_gemmk
=
block2tile
.
CalculateBottomIndex
(
make_multi_index
(
id_block
));
const
auto
iGemmM
=
ps
.
read
_
first
_
lane
(
i_gemmm_gemmk
[
0
])
*
kMPerBlock
;
const
auto
iGemmM
=
__builtin_amdgcn_
readfirstlane
(
i_gemmm_gemmk
[
0
])
*
kMPerBlock
;
// src window
// src window
auto
src_block_window
=
auto
src_block_window
=
...
@@ -327,26 +322,26 @@ int main()
...
@@ -327,26 +322,26 @@ int main()
ck
::
index_t
kGridSize
=
(
N
*
Ho
*
Wo
)
/
kGemmMPerBlock
;
ck
::
index_t
kGridSize
=
(
N
*
Ho
*
Wo
)
/
kGemmMPerBlock
;
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
Im2Col
<
2
,
DataType
,
kBlockSize
,
kGemmMPerBlock
,
kGemmKPerBlock
>
{
},
launch_kernel
(
StreamConfig
{
nullptr
,
true
},
kGridSize
,
Im2Col
<
2
,
DataType
,
kBlockSize
,
kGemmMPerBlock
,
kGemmKPerBlock
>
{}
,
kBlock
Size
,
kGrid
Size
,
in_lengths
,
kBlockSize
,
in_strides
,
0
,
we
i_lengths
,
i
n
_lengths
,
we
i_strides
,
i
n
_strides
,
out
_lengths
,
wei
_lengths
,
out
_strides
,
wei
_strides
,
filter_stride
s
,
out_length
s
,
filter_dilation
s
,
out_stride
s
,
input_left_pad
s
,
filter_stride
s
,
input_right_pad
s
,
filter_dilation
s
,
//
input_left_pads
,
in_mtx_length
s
,
input_right_pad
s
,
in_mtx_
stride
s
,
in_mtx_
length
s
,
//
in_mtx_strides
,
static_cast
<
DataType
*>
(
in_buf
.
GetDeviceBuffer
()),
static_cast
<
DataType
*>
(
in_buf
.
GetDeviceBuffer
()),
static_cast
<
DataType
*>
(
in_mtx_buf
.
GetDeviceBuffer
()));
static_cast
<
DataType
*>
(
in_mtx_buf
.
GetDeviceBuffer
()));
std
::
size_t
num_btype
=
sizeof
(
DataType
)
*
in_host
.
GetElementSize
()
+
std
::
size_t
num_btype
=
sizeof
(
DataType
)
*
in_host
.
GetElementSize
()
+
sizeof
(
DataType
)
*
in_mtx_host_ref
.
GetElementSize
();
sizeof
(
DataType
)
*
in_mtx_host_ref
.
GetElementSize
();
...
...
example/91_tile_program/reduce.cpp
View file @
6bc9ee05
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -39,9 +38,9 @@ void reference_reduce(const Tensor<ADataType>& a_m_n, Tensor<BDataType>& b_m)
...
@@ -39,9 +38,9 @@ void reference_reduce(const Tensor<ADataType>& a_m_n, Tensor<BDataType>& b_m)
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
using
ADataType
=
floa
t
;
using
ADataType
=
ck
::
half_
t
;
using
AccDataType
=
float
;
using
AccDataType
=
float
;
using
BDataType
=
floa
t
;
using
BDataType
=
ck
::
half_
t
;
ck
::
index_t
M
=
3328
;
ck
::
index_t
M
=
3328
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
N
=
4096
;
...
@@ -84,14 +83,15 @@ int main(int argc, char* argv[])
...
@@ -84,14 +83,15 @@ int main(int argc, char* argv[])
const
auto
kernel
=
const
auto
kernel
=
Reduce
<
ADataType
,
AccDataType
,
BDataType
,
kBlockSize
,
kMPerBlock
,
kNPerBlock
>
{};
Reduce
<
ADataType
,
AccDataType
,
BDataType
,
kBlockSize
,
kMPerBlock
,
kNPerBlock
>
{};
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
launch_kernel
(
StreamConfig
{
nullptr
,
true
},
kernel
,
kernel
,
kGridSize
,
kGridSize
,
kBlockSize
,
kBlockSize
,
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
0
,
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
M
,
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
N
);
M
,
N
);
b_buf
.
FromDevice
(
b_host_dev
.
mData
.
data
());
b_buf
.
FromDevice
(
b_host_dev
.
mData
.
data
());
...
...
example/91_tile_program/reduce.hpp
View file @
6bc9ee05
...
@@ -8,7 +8,6 @@
...
@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
...
@@ -25,7 +24,7 @@ template <typename ADataType,
...
@@ -25,7 +24,7 @@ template <typename ADataType,
struct
Reduce
struct
Reduce
{
{
#if 0
#if 0
__host__
__device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
{
using namespace ck;
using namespace ck;
using namespace ck::tile_program;
using namespace ck::tile_program;
...
@@ -40,7 +39,7 @@ struct Reduce
...
@@ -40,7 +39,7 @@ struct Reduce
Sequence<0, 0, 2, 4>>{});
Sequence<0, 0, 2, 4>>{});
}
}
#elif
0
#elif
0
__host__
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -55,7 +54,7 @@ struct Reduce
...
@@ -55,7 +54,7 @@ struct Reduce
Sequence
<
0
,
0
,
2
,
4
>>
{});
Sequence
<
0
,
0
,
2
,
4
>>
{});
}
}
#elif 1
#elif 1
__host__
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -71,8 +70,8 @@ struct Reduce
...
@@ -71,8 +70,8 @@ struct Reduce
}
}
#endif
#endif
__host__
__device__
void
operator
()(
__device__
void
ProgramServer
&
ps
,
const
ADataType
*
p_a
,
BDataType
*
p_b
,
ck
::
index_t
M
,
ck
::
index_t
N
)
const
operator
()(
const
ADataType
*
p_a
,
BDataType
*
p_b
,
ck
::
index_t
M
,
ck
::
index_t
N
)
const
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -81,7 +80,7 @@ struct Reduce
...
@@ -81,7 +80,7 @@ struct Reduce
const
auto
a_m_n
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
const
auto
a_m_n
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
p_a
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
p_a
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
const
auto
iM
=
ps
.
get_block_id
()
*
kMPerBlock
;
const
auto
iM
=
get_block_id
()
*
kMPerBlock
;
// A window
// A window
auto
a_block_window
=
auto
a_block_window
=
...
...
example/91_tile_program/softmax.cpp
View file @
6bc9ee05
...
@@ -4,9 +4,8 @@
...
@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -64,14 +63,15 @@ int main(int argc, char* argv[])
...
@@ -64,14 +63,15 @@ int main(int argc, char* argv[])
const
auto
kernel
=
const
auto
kernel
=
Softmax
<
ADataType
,
AccDataType
,
BDataType
,
kBlockSize
,
kMPerBlock
,
kNPerBlock
>
{};
Softmax
<
ADataType
,
AccDataType
,
BDataType
,
kBlockSize
,
kMPerBlock
,
kNPerBlock
>
{};
float
ave_time
=
launch
(
ProgramServer
{},
float
ave_time
=
launch_kernel
(
StreamConfig
{
nullptr
,
true
},
kernel
,
kernel
,
kGridSize
,
kGridSize
,
kBlockSize
,
kBlockSize
,
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
0
,
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
static_cast
<
ADataType
*>
(
a_buf
.
GetDeviceBuffer
()),
M
,
static_cast
<
BDataType
*>
(
b_buf
.
GetDeviceBuffer
()),
N
);
M
,
N
);
b_buf
.
FromDevice
(
b_host_dev
.
mData
.
data
());
b_buf
.
FromDevice
(
b_host_dev
.
mData
.
data
());
...
@@ -81,8 +81,5 @@ int main(int argc, char* argv[])
...
@@ -81,8 +81,5 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"dev: "
,
b_host_dev
.
mData
,
", "
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"ref: "
,
b_host_ref
.
mData
,
", "
)
<<
std
::
endl
;
return
!
ck
::
utils
::
check_err
(
b_host_dev
,
b_host_ref
);
return
!
ck
::
utils
::
check_err
(
b_host_dev
,
b_host_ref
);
}
}
example/91_tile_program/softmax.hpp
View file @
6bc9ee05
...
@@ -8,7 +8,6 @@
...
@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
...
@@ -25,7 +24,7 @@ template <typename ADataType,
...
@@ -25,7 +24,7 @@ template <typename ADataType,
struct
Softmax
struct
Softmax
{
{
#if 0
#if 0
__host__
__device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
{
using namespace ck;
using namespace ck;
using namespace ck::tile_program;
using namespace ck::tile_program;
...
@@ -40,7 +39,7 @@ struct Softmax
...
@@ -40,7 +39,7 @@ struct Softmax
Sequence<0, 0, 2, 4>>{});
Sequence<0, 0, 2, 4>>{});
}
}
#elif
0
#elif
0
__host__
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -55,7 +54,7 @@ struct Softmax
...
@@ -55,7 +54,7 @@ struct Softmax
Sequence
<
0
,
0
,
2
,
4
>>
{});
Sequence
<
0
,
0
,
2
,
4
>>
{});
}
}
#elif 1
#elif 1
__host__
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
__device__
static
constexpr
auto
MakeABlockTileDistribution
()
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -71,8 +70,8 @@ struct Softmax
...
@@ -71,8 +70,8 @@ struct Softmax
}
}
#endif
#endif
__host__
__device__
void
operator
()(
__device__
void
ProgramServer
&
ps
,
const
ADataType
*
p_a
,
BDataType
*
p_b
,
ck
::
index_t
M
,
ck
::
index_t
N
)
const
operator
()(
const
ADataType
*
p_a
,
BDataType
*
p_b
,
ck
::
index_t
M
,
ck
::
index_t
N
)
const
{
{
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tile_program
;
using
namespace
ck
::
tile_program
;
...
@@ -84,7 +83,7 @@ struct Softmax
...
@@ -84,7 +83,7 @@ struct Softmax
const
auto
a_m_n
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
const
auto
a_m_n
=
make_naive_tensor_view
<
AddressSpaceEnum
::
Global
>
(
p_a
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
p_a
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
Number
<
32
>
{},
Number
<
1
>
{});
const
auto
iM
=
ps
.
get_block_id
()
*
kMPerBlock
;
const
auto
iM
=
get_block_id
()
*
kMPerBlock
;
// A window
// A window
auto
a_block_window
=
auto
a_block_window
=
...
...
example/91_tile_program/tile_program.hpp
deleted
100644 → 0
View file @
f3baea0d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
// Meta data for GPU
// TODO: do we need to take care of data alignment in code or it's done by compiler?
template
<
ck
::
index_t
kSize
>
struct
MetaData
{
char
p_data_
[
kSize
];
ck
::
index_t
size_
=
0
;
ck
::
index_t
pos_
=
0
;
__host__
__device__
void
reset
()
{
size_
=
0
;
pos_
=
0
;
}
__device__
void
reset_pos
()
{
pos_
=
0
;
}
// push meta data on host
// TODO: correct forwarding?
template
<
typename
T
>
__host__
auto
push
(
T
&&
a
)
{
using
Type
=
ck
::
remove_cvref_t
<
T
>
;
static_assert
(
std
::
is_trivially_copy_constructible_v
<
Type
>
&&
std
::
is_trivially_destructible_v
<
Type
>
);
assert
(
size_
+
sizeof
(
Type
)
<=
kSize
);
// use placement new to create object copy
new
(
p_data_
+
size_
)
Type
(
std
::
forward
<
T
>
(
a
));
size_
+=
sizeof
(
Type
);
return
ck
::
forwarder
{}(
a
);
}
// pull meta data on device
// TODO: correct forwarding?
template
<
typename
T
>
__device__
auto
pull
()
{
using
Type
=
ck
::
remove_cvref_t
<
T
>
;
static_assert
(
std
::
is_trivially_copy_constructible_v
<
Type
>
&&
std
::
is_trivially_destructible_v
<
Type
>
);
Type
a
(
*
reinterpret_cast
<
Type
*>
(
p_data_
+
pos_
));
pos_
+=
sizeof
(
Type
);
return
a
;
}
};
// namespace tp (for tile programming)
struct
ProgramServer
{
// meta data on device
MetaData
<
1024
>
meta_data_
;
__host__
void
cpu_init
()
{
meta_data_
.
reset
();
}
__device__
void
gpu_init
()
{
meta_data_
.
reset_pos
();
}
// push meta data on host
template
<
typename
T
>
__host__
auto
operator
()(
T
&&
a
)
{
return
ck
::
forwarder
{}(
meta_data_
.
push
(
a
));
}
// push meta data on host
template
<
typename
T
>
__device__
auto
operator
()(
T
&&
)
{
return
ck
::
forwarder
{}(
meta_data_
.
pull
<
T
>
());
}
//
__host__
static
ck
::
index_t
get_block_id
()
{
return
-
1
;
}
__host__
static
ck
::
index_t
get_thread_id
()
{
return
-
1
;
}
__host__
static
ck
::
index_t
get_grid_size
()
{
return
-
1
;
}
__host__
static
void
block_sync_lds
()
{}
// TODO: correct forwarding?
template
<
typename
T
>
__host__
static
constexpr
auto
read_first_lane
(
T
&&
a
)
{
return
ck
::
forwarder
{}(
a
);
}
template
<
typename
T
>
__host__
T
warp_shuffle_up
(
T
,
uint32_t
)
{
return
0
;
}
template
<
typename
T
>
__host__
T
warp_shuffle_down
(
T
,
uint32_t
)
{
return
0
;
}
//
__device__
static
ck
::
index_t
get_block_id
()
{
return
ck
::
get_block_id
();
}
__device__
static
ck
::
index_t
get_thread_id
()
{
return
ck
::
get_thread_id
();
}
__device__
static
ck
::
index_t
get_grid_size
()
{
return
ck
::
get_grid_size
();
}
__device__
static
void
block_sync_lds
()
{
ck
::
block_sync_lds
();
}
template
<
typename
T
>
__device__
static
constexpr
auto
read_first_lane
(
T
&&
a
)
{
return
__builtin_amdgcn_readfirstlane
(
a
);
}
template
<
typename
T
>
__device__
T
warp_shuffle_up
(
const
T
&
var
,
uint32_t
delta
)
{
return
ck
::
warp_shuffle_up
(
var
,
delta
);
}
template
<
typename
T
>
__device__
T
warp_shuffle_down
(
const
T
&
var
,
uint32_t
delta
)
{
return
ck
::
warp_shuffle_down
(
var
,
delta
);
}
};
template
<
typename
Server
,
typename
Program
,
typename
...
Xs
>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
__global__
void
gpu_program_wrapper
(
Server
server
,
Program
f
,
Xs
...
xs
)
{
server
.
gpu_init
();
f
(
server
,
xs
...);
}
template
<
typename
Server
,
typename
Program
,
typename
...
Xs
>
float
launch
(
Server
server
,
Program
f
,
dim3
grid_dim
,
dim3
block_dim
,
Xs
...
xs
)
{
server
.
cpu_init
();
f
(
server
,
xs
...);
printf
(
"meta data size %d
\n
"
,
server
.
meta_data_
.
size_
);
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
__func__
,
grid_dim
.
x
,
grid_dim
.
y
,
grid_dim
.
z
,
block_dim
.
x
,
block_dim
.
y
,
block_dim
.
z
);
#if 0
gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
#else
return
launch_and_time_kernel
(
StreamConfig
{
nullptr
,
true
,
0
},
gpu_program_wrapper
<
Server
,
Program
,
Xs
...
>
,
grid_dim
,
block_dim
,
0
,
server
,
f
,
xs
...);
#endif
}
include/ck/host_utility/kernel_launch.hpp
View file @
6bc9ee05
...
@@ -9,6 +9,15 @@
...
@@ -9,6 +9,15 @@
#include "ck/stream_config.hpp"
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/host_utility/hip_check_error.hpp"
template
<
int
MaxThreadPerBlock
,
int
MinBlockPerCu
,
typename
Kernel
,
typename
...
Args
>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
MaxThreadPerBlock
,
MinBlockPerCu
)
#endif
__global__
void
kernel_wrapper
(
Kernel
f
,
Args
...
args
)
{
f
(
args
...);
}
template
<
typename
...
Args
,
typename
F
>
template
<
typename
...
Args
,
typename
F
>
float
launch_and_time_kernel
(
const
StreamConfig
&
stream_config
,
float
launch_and_time_kernel
(
const
StreamConfig
&
stream_config
,
F
kernel
,
F
kernel
,
...
@@ -142,3 +151,20 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
...
@@ -142,3 +151,20 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
return
0
;
return
0
;
#endif
#endif
}
}
template
<
int
MaxThreadPerBlock
=
CK_MAX_THREAD_PER_BLOCK
,
int
MinBlockPerCu
=
CK_MIN_BLOCK_PER_CU
,
typename
KernelImpl
,
typename
...
Args
>
float
launch_kernel
(
const
StreamConfig
&
stream_config
,
KernelImpl
kernel_impl
,
dim3
grid_dim
,
dim3
block_dim
,
std
::
size_t
lds_byte
,
Args
...
args
)
{
const
auto
kernel
=
kernel_wrapper
<
MaxThreadPerBlock
,
MinBlockPerCu
,
KernelImpl
,
Args
...
>
;
return
launch_and_time_kernel
(
stream_config
,
kernel
,
grid_dim
,
block_dim
,
lds_byte
,
kernel_impl
,
args
...);
}
include/ck/tile_program/block_tile/block_gemm_areg_bsmem_creg_v1.hpp
View file @
6bc9ee05
...
@@ -340,63 +340,6 @@ struct BlockGemmARegBSmemCRegV1
...
@@ -340,63 +340,6 @@ struct BlockGemmARegBSmemCRegV1
return
c_block_tensor
;
return
c_block_tensor
;
}
}
// FIXME: remove: dummy host function for tile programming
template
<
typename
CBlockTensor
,
typename
ABlockTensorTmp
,
typename
BBlockWindowTmp
>
__host__
void
operator
()(
CBlockTensor
&
,
const
ABlockTensorTmp
&
,
const
BBlockWindowTmp
&
)
const
{
}
// FIXME: remove: dummy host function for tile programming
template
<
typename
ABlockTensorTmp
,
typename
BBlockWindowTmp
>
__host__
auto
operator
()(
const
ABlockTensorTmp
&
,
const
BBlockWindowTmp
&
)
const
{
static_assert
(
is_same_v
<
ADataType
,
remove_cv_t
<
typename
ABlockTensorTmp
::
DataType
>>
&&
is_same_v
<
BDataType
,
remove_cv_t
<
typename
BBlockWindowTmp
::
DataType
>>
,
"wrong!"
);
constexpr
index_t
MPerBlock
=
ABlockTensorTmp
{}.
GetLengths
()[
Number
<
0
>
{}];
constexpr
index_t
NPerBlock
=
BBlockWindowTmp
{}.
GetWindowLengths
()[
Number
<
0
>
{}];
static_assert
(
MPerBlock
==
BlockGemmShape
::
kM
&&
NPerBlock
==
BlockGemmShape
::
kN
,
"wrong!"
);
constexpr
auto
config
=
Policy
::
template
GetWarpGemmMWarpNWarp
<
Problem
>();
using
WG
=
remove_cvref_t
<
decltype
(
config
.
template
At
<
0
>())
>
;
constexpr
index_t
MWarp
=
config
.
template
At
<
1
>();
constexpr
index_t
NWarp
=
config
.
template
At
<
2
>();
constexpr
index_t
MIterPerWarp
=
MPerBlock
/
(
MWarp
*
WG
::
kM
);
constexpr
index_t
NIterPerWarp
=
NPerBlock
/
(
NWarp
*
WG
::
kN
);
constexpr
auto
c_block_outer_dstr_encoding
=
StaticTileDistributionEncoding
<
Sequence
<>
,
Tuple
<
Sequence
<
MIterPerWarp
,
MWarp
>
,
Sequence
<
NIterPerWarp
,
NWarp
>>
,
Tuple
<
Sequence
<
1
,
2
>>
,
Tuple
<
Sequence
<
1
,
1
>>
,
Sequence
<
1
,
2
>
,
Sequence
<
0
,
0
>>
{};
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WG
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
#if 0
// FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent distribution
static_assert(
is_same_v<remove_cvref_t<decltype(a_block_dstr_encode)>,
remove_cvref_t<decltype(
ABlockTensorTmp::GetBlockDistribution().GetStaticTensorDistributionEncoding())>>,
"wrong!");
#endif
// Construct C-Block-Tensor
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
};
};
}
// namespace block
}
// namespace block
...
...
include/ck/tile_program/block_tile/block_gemm_asmem_bsmem_creg_v1.hpp
View file @
6bc9ee05
...
@@ -331,55 +331,6 @@ struct BlockGemmASmemBSmemCRegV1
...
@@ -331,55 +331,6 @@ struct BlockGemmASmemBSmemCRegV1
return
c_block_tensor
;
return
c_block_tensor
;
}
}
// FIXME: remove: dummy host function for tile programming
template
<
typename
CBlockTensor
,
typename
ABlockWindowTmp
,
typename
BBlockWindowTmp
>
__host__
void
operator
()(
CBlockTensor
&
,
const
ABlockWindowTmp
&
,
const
BBlockWindowTmp
&
)
const
{
}
// FIXME: remove: dummy host function for tile programming
template
<
typename
ABlockWindowTmp
,
typename
BBlockWindowTmp
>
__host__
auto
operator
()(
const
ABlockWindowTmp
&
,
const
BBlockWindowTmp
&
)
const
{
static_assert
(
is_same_v
<
ADataType
,
typename
ABlockWindowTmp
::
DataType
>
&&
is_same_v
<
BDataType
,
typename
BBlockWindowTmp
::
DataType
>
,
"wrong!"
);
constexpr
index_t
MPerBlock
=
ABlockWindowTmp
{}.
GetWindowLengths
()[
Number
<
0
>
{}];
constexpr
index_t
NPerBlock
=
BBlockWindowTmp
{}.
GetWindowLengths
()[
Number
<
0
>
{}];
static_assert
(
MPerBlock
==
BlockGemmShape
::
kM
&&
NPerBlock
==
BlockGemmShape
::
kN
,
"wrong!"
);
constexpr
auto
config
=
Policy
::
template
GetWarpGemmMWarpNWarp
<
Problem
>();
using
WG
=
remove_cvref_t
<
decltype
(
config
.
template
At
<
0
>())
>
;
constexpr
index_t
MWarp
=
config
.
template
At
<
1
>();
constexpr
index_t
NWarp
=
config
.
template
At
<
2
>();
constexpr
index_t
MIterPerWarp
=
MPerBlock
/
(
MWarp
*
WG
::
kM
);
constexpr
index_t
NIterPerWarp
=
NPerBlock
/
(
NWarp
*
WG
::
kN
);
constexpr
auto
c_block_outer_dstr_encoding
=
StaticTileDistributionEncoding
<
Sequence
<>
,
Tuple
<
Sequence
<
MIterPerWarp
,
MWarp
>
,
Sequence
<
NIterPerWarp
,
NWarp
>>
,
Tuple
<
Sequence
<
1
,
2
>>
,
Tuple
<
Sequence
<
1
,
1
>>
,
Sequence
<
1
,
2
>
,
Sequence
<
0
,
0
>>
{};
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WG
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
static_assert
(
is_same_v
<
CDataType
,
typename
WG
::
CDataType
>
,
"wrong!"
);
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
};
};
}
// namespace block
}
// namespace block
...
...
include/ck/tile_program/block_tile/block_reduce.hpp
View file @
6bc9ee05
...
@@ -210,52 +210,6 @@ __device__ auto block_tile_reduce(const InDistributedTensor_& in_tensor,
...
@@ -210,52 +210,6 @@ __device__ auto block_tile_reduce(const InDistributedTensor_& in_tensor,
return
acc_tensor
;
return
acc_tensor
;
}
}
// FIXME: dummy host function for tile program
template
<
typename
AccDistributedTensor_
,
typename
InDistributedTensor_
,
index_t
...
InReduceDims
,
typename
ReduceFunc
>
__host__
void
block_tile_reduce
(
AccDistributedTensor_
&
,
const
InDistributedTensor_
&
,
Sequence
<
InReduceDims
...
>
,
const
ReduceFunc
&
)
{
}
// FIXME: dummy host function for tile program
template
<
typename
AccDataType_
,
typename
InDistributedTensor_
,
index_t
...
InReduceDims
,
typename
ReduceFunc
,
typename
InDataType_
>
__host__
auto
block_tile_reduce
(
const
InDistributedTensor_
&
,
Sequence
<
InReduceDims
...
>
,
const
ReduceFunc
&
,
const
InDataType_
&
)
{
using
InDataType
=
typename
InDistributedTensor_
::
DataType
;
using
AccDataType
=
remove_cvref_t
<
AccDataType_
>
;
static_assert
(
is_same_v
<
InDataType
,
remove_cvref_t
<
InDataType_
>>
,
"wrong!"
);
// declare acc_tensor
constexpr
auto
acc_dstr
=
make_static_tile_distribution
(
ck
::
tile_program
::
detail
::
make_reduce_tile_distribution_encoding
(
InDistributedTensor_
::
GetTileDistribution
().
GetStaticTileDistributionEncoding
(),
Sequence
<
InReduceDims
...
>
{}));
auto
acc_tensor
=
make_static_distributed_tensor
<
AccDataType
>
(
acc_dstr
);
return
acc_tensor
;
}
// FIXME: dummy host function for tile program
template
<
typename
AccDistributedTensor_
,
typename
ReduceFunc
>
__host__
void
block_tile_reduce_sync
(
AccDistributedTensor_
&
,
const
ReduceFunc
&
)
{
}
}
// namespace block
}
// namespace block
}
// namespace tile_program
}
// namespace tile_program
}
// namespace ck
}
// namespace ck
include/ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
View file @
6bc9ee05
...
@@ -159,12 +159,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
...
@@ -159,12 +159,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
a_block_tile
=
load_tile
(
a_copy_dram_window
);
a_block_tile
=
load_tile
(
a_copy_dram_window
);
b_block_tile
=
load_tile
(
b_copy_dram_window
);
b_block_tile
=
load_tile
(
b_copy_dram_window
);
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// GEMM i
// GEMM i
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// move to i + 2
// move to i + 2
move_tile_window
(
a_copy_dram_window
,
{
0
,
kKPerBlock
});
move_tile_window
(
a_copy_dram_window
,
{
0
,
kKPerBlock
});
...
@@ -184,7 +184,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
...
@@ -184,7 +184,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
// tail
// tail
{
{
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// GEMM num_loop - 1
// GEMM num_loop - 1
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
...
@@ -194,10 +194,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
...
@@ -194,10 +194,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
}
}
template
<
typename
ADramBlockWindowTmp
,
typename
BDramBlockWindowTmp
>
template
<
typename
ADramBlockWindowTmp
,
typename
BDramBlockWindowTmp
>
__host__
__device__
auto
operator
()(
const
ADramBlockWindowTmp
&
a_dram_block_window_tmp
,
__device__
auto
operator
()(
const
ADramBlockWindowTmp
&
a_dram_block_window_tmp
,
const
BDramBlockWindowTmp
&
b_dram_block_window_tmp
,
const
BDramBlockWindowTmp
&
b_dram_block_window_tmp
,
index_t
num_loop
,
index_t
num_loop
,
void
*
p_smem
)
const
void
*
p_smem
)
const
{
{
return
operator
()(
return
operator
()(
a_dram_block_window_tmp
,
a_dram_block_window_tmp
,
...
...
include/ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
View file @
6bc9ee05
...
@@ -161,12 +161,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
...
@@ -161,12 +161,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
do
do
{
{
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// GEMM i
// GEMM i
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// move to i + 2
// move to i + 2
move_tile_window
(
a_copy_dram_window
,
{
0
,
kKPerBlock
});
move_tile_window
(
a_copy_dram_window
,
{
0
,
kKPerBlock
});
...
@@ -190,12 +190,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
...
@@ -190,12 +190,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
// tail
// tail
{
{
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// GEMM num_loop - 2
// GEMM num_loop - 2
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// LDS write num_loop - 1
// LDS write num_loop - 1
const
auto
a_block_tile_tmp
=
tile_elementwise_in
(
a_element_func
,
a_block_tile
);
const
auto
a_block_tile_tmp
=
tile_elementwise_in
(
a_element_func
,
a_block_tile
);
...
@@ -204,7 +204,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
...
@@ -204,7 +204,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
const
auto
b_block_tile_tmp
=
tile_elementwise_in
(
b_element_func
,
b_block_tile
);
const
auto
b_block_tile_tmp
=
tile_elementwise_in
(
b_element_func
,
b_block_tile
);
store_tile
(
b_copy_lds_window
,
b_block_tile_tmp
);
store_tile
(
b_copy_lds_window
,
b_block_tile_tmp
);
ProgramServer
::
block_sync_lds
();
block_sync_lds
();
// GEMM num_loop - 1
// GEMM num_loop - 1
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
block_gemm
(
c_block_tile
,
a_lds_gemm_window
,
b_lds_gemm_window
);
...
@@ -214,10 +214,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
...
@@ -214,10 +214,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
}
}
template
<
typename
ADramBlockWindowTmp
,
typename
BDramBlockWindowTmp
>
template
<
typename
ADramBlockWindowTmp
,
typename
BDramBlockWindowTmp
>
__host__
__device__
auto
operator
()(
const
ADramBlockWindowTmp
&
a_dram_block_window_tmp
,
__device__
auto
operator
()(
const
ADramBlockWindowTmp
&
a_dram_block_window_tmp
,
const
BDramBlockWindowTmp
&
b_dram_block_window_tmp
,
const
BDramBlockWindowTmp
&
b_dram_block_window_tmp
,
index_t
num_loop
,
index_t
num_loop
,
void
*
p_smem
)
const
void
*
p_smem
)
const
{
{
return
operator
()(
return
operator
()(
a_dram_block_window_tmp
,
a_dram_block_window_tmp
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment