gaoqiong / composable_kernel

Commit c5b3f69f, authored Feb 15, 2023 by Chao Liu (parent 800f0ab3)

Commit message: pseudo code
Showing 4 changed files with 422 additions and 0 deletions (+422 -0):
example/91_tile_program/copy.cpp         +34  -0
example/91_tile_program/gemm.cpp         +186 -0
example/91_tile_program/hello_world.cpp  +2   -0
example/91_tile_program/im2col.cpp       +200 -0

example/91_tile_program/copy.cpp (new file, mode 100644)
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"

// program
struct GemmMultiplD
{
    __host__ __device__ void operator()(TileProgram& tp, int x, int y)
    {
        auto desc = tp.make_naive_tensor_descriptor_packed(ck::make_tuple(x));

        printf("length %d\n", desc.GetLength(ck::Number<0>{}));
    }
};

int main()
{
    int x = 100;
    int y = 101;

    launch(GemmMultiplD{}, 1, 1, x, y);

    return 0;
}
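
tile_program.hpp and the launch() helper are not part of this commit, so the machinery behind the example is opaque here. The following is a minimal, hypothetical sketch of what a functor launcher of this shape could look like under HIP; TileProgram is an empty stand-in and kernel_entry an invented name, not the commit's actual API.

    #include <hip/hip_runtime.h>

    struct TileProgram{}; // stand-in; the real type lives in tile_program.hpp

    // device entry point: forward the launch arguments to the program functor
    template <typename Program, typename... Args>
    __global__ void kernel_entry(Program program, Args... args)
    {
        TileProgram tp{};
        program(tp, args...);
    }

    // host helper mirroring launch(program, grid_size, block_size, args...)
    template <typename Program, typename... Args>
    void launch(Program program, int grid_size, int block_size, Args... args)
    {
        kernel_entry<<<grid_size, block_size>>>(program, args...);
        (void)hipDeviceSynchronize(); // flush device printf before returning
    }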

example/91_tile_program/gemm.cpp (new file, mode 100644)

template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation>
struct GemmMultiD
{
    static constexpr index_t NumDTensor = DsDataType::Size();

    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, 2> a_m_k_lengths,
               const std::array<index_t, 2> a_m_k_strides,
               const std::array<index_t, 2> b_n_k_lengths,
               const std::array<index_t, 2> b_n_k_strides,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_lengths,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_strides,
               const std::array<index_t, 2> e_m_n_lengths,
               const std::array<index_t, 2> e_m_n_strides,
               //
               const T* p_a,
               const T* p_b,
               const std::array<const T*, NumDTensor> p_ds,
               T* p_e)
    {
        using namespace ck;

        const auto b = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);

        const auto ds = tp(generate_tuple(
            [&](auto i) {
                return make_naive_tensor(ds_m_n_lengths[i], ds_m_n_strides[i], p_ds[i]);
            },
            Number<NumDTensor>{}));

        auto e = tp(make_naive_tensor(e_m_n_lengths, e_m_n_strides), p_e);

        // divide problem
        const auto num_m = e_m_n_lengths[0];
        const auto num_n = e_m_n_lengths[1];

        const auto id_block = get_block_1d_id();

        const auto num_tile_m = num_m / MPerTile;
        const auto num_tile_n = num_n / NPerTile;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_n)));

        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_n = id_tile.At<1>();
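
        // A sketch of the mapping, assuming make_cluster_descriptor lays the
        // tiles out row-major (the strategy is left open in this pseudo code):
        //
        //   id_tile_m = id_block / num_tile_n;
        //   id_tile_n = id_block % num_tile_n;
        //
        // i.e. consecutive block ids sweep the N tiles of one M row first.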

        // A/B in DRAM
        // A/B DRAM layout is part of problem, not solution
#if 1
        // DO NOT let user know there is optimization on tensor transform on A/B DRAM tensor
        const auto a_dram_global = tp(make_naive_tensor(a_m_k_lengths, a_m_k_strides), p_a);

        const auto b_dram_global = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);
#endif

        // A/B tile in LDS
        // A/B LDS layout is part of solution
        ADataType* p_a_lds = shared_memory.get_pointer(0);

        // [allow optimization] allow different LDS layouts
        constexpr auto a_lds_block =
            make_tensor(p_a_lds, {kMPerBlock, kKPerBlock}, a_lds_block_strategy);

        constexpr auto a_lds_byte = a_lds_block.get_num_of_byte();

        BDataType* p_b_lds = shared_memory.get_aligned_pointer(a_lds_byte);

        // [allow optimization] allow different LDS layouts
        constexpr auto b_lds_block =
            make_tensor(p_b_lds, {kNPerBlock, kKPerBlock}, b_lds_block_strategy);

        // A/B copy
#if 0
        // operation-based syntax: per-operation solution strategy
        auto a_block_copy = make_copier(a_dram_global,
                                        a_lds_block,
                                        make_tuple(kMPerBlock, kKPerBlock),
                                        make_tuple(id_tile_m * kMPerBlock, 0),
                                        a_block_copy_strategy);

        auto b_block_copy = make_copier(b_dram_global,
                                        b_lds_block,
                                        make_tuple(kNPerBlock, kKPerBlock),
                                        make_tuple(id_tile_n * kNPerBlock, 0),
                                        b_block_copy_strategy);
#else
        // data-based syntax: per-data solution strategy
        auto window_a_dram = make_window(a_dram_global,
                                         {MPerTile, KPerTile},
                                         {id_tile_m * MPerTile, 0},
                                         a_dram_window_map_strategy);

        auto window_a_block = make_window(a_lds_block,
                                          {MPerTile, KPerTile},
                                          {0, 0},
                                          a_lds_window_map_strategy);

        // (windows for B would be set up likewise)
#endif

#if 1
        // block GEMM
        // operation-based syntax: per-operation solution strategy
        auto block_gemm = make_block_gemm(a_lds_block, b_lds_block, block_gemm_strategy);
#endif

        // Distributed C in VGPR
#if 1
        // C layout is decided alone
        // C should be distributed
        auto c_vgpr_block = make_distributed_tensor({kMPerBlock, kNPerBlock}, c_vgpr_block_strategy);
#elif 0
        // C layout is decided by block GEMM
        auto c_vgpr_block = block_gemm.get_c_vgpr_block();
#endif

        for(index_t k = 0; k < K; k += kKPerBlock)
        {
            auto a_vgpr_block_tmp = load(window_a_dram, a_dram_load_strategy);
            auto b_vgpr_block_tmp = load(window_b_dram, b_dram_load_strategy);

            auto a_vgpr_block = elementwise_op(a_vgpr_block_tmp, a_element_op);
            auto b_vgpr_block = elementwise_op(b_vgpr_block_tmp, b_element_op);

            copy(a_vgpr_block, a_lds_block, a_lds_store_strategy);
            copy(b_vgpr_block, b_lds_block, b_lds_store_strategy);

            block_sync_lds();

            dot_product_accumulate(c_vgpr_block, a_lds_block, b_lds_block);

            block_sync_lds();

            window_a_dram += {0, kKPerBlock};
            window_b_dram += {0, kKPerBlock};
        }

        // shuffle C through LDS, then fuse D and write out E
        auto p_c_lds = xxx;

        auto c_lds = make_tensor(p_c_lds, xxxxxx);

        auto window_c_vgpr = make_window(c_vgpr_block,
                                         {kMPerShuffle, kNPerShuffle},
                                         {0, 0},
                                         c_vgpr_window_strategy);

        auto window_c_lds = make_window(c_lds,
                                        {kMPerShuffle, kNPerShuffle},
                                        {0, 0},
                                        c_lds_window_strategy);

        auto window_d_dram = make_window(d_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerTile, id_tile_n * kNPerTile},
                                         d_dram_window_strategy);

        auto window_e_dram = make_window(e_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerTile, id_tile_n * kNPerTile},
                                         e_dram_window_strategy);

        for(index_t m = 0; m < kMPerBlock; m += kMPerShuffle)
        {
            for(index_t n = 0; n < kNPerBlock; n += kNPerShuffle)
            {
                // write C into LDS for shuffle
                copy(window_c_vgpr, window_c_lds, c_lds_store_strategy);

                // load C from LDS to complete shuffle
                auto c_vgpr_block_slice = load(window_c_lds, c_lds_load_strategy);

                // load D from DRAM
                auto d_vgpr_block_slice = load(window_d_dram, d_dram_load_strategy);

                // elementwise op
                // [Question] need to guarantee this always works:
                // 1. C/D should have the same layout; how to guarantee that?
                // 2. if C/D have different layouts, a shuffle is needed
                // 3. if C/D have different layouts, what should E's layout be?
                auto e_vgpr_block_slice =
                    elementwise_op(c_vgpr_block_slice, d_vgpr_block_slice, cd_elementwise_op);

                // write E into DRAM
                copy(e_vgpr_block_slice, window_e_dram, e_dram_store_strategy);
            }
        }
    }
};
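
make_window, load, copy, and the window += offset idiom are used as vocabulary above but never defined in this commit. A toy, host-only model of the concept (every type and name below is an illustrative assumption, not the commit's API): a window is a view with a movable origin and a fixed extent, load materializes the window into a local buffer, and += slides the origin.

    #include <array>
    #include <cstdio>
    #include <vector>

    // toy row-major 2-D tensor
    struct Tensor2D
    {
        int lengths[2];
        std::vector<float> data;
        float& at(int i, int j) { return data[i * lengths[1] + j]; }
    };

    // toy window: movable origin + fixed extent over a tensor
    struct Window2D
    {
        Tensor2D* tensor;
        int extent[2];
        int origin[2];

        void operator+=(std::array<int, 2> step)
        {
            origin[0] += step[0];
            origin[1] += step[1];
        }
    };

    // load: copy the window's elements into a local (register-like) buffer
    std::vector<float> load(Window2D& w)
    {
        std::vector<float> buf;
        for(int i = 0; i < w.extent[0]; ++i)
            for(int j = 0; j < w.extent[1]; ++j)
                buf.push_back(w.tensor->at(w.origin[0] + i, w.origin[1] + j));
        return buf;
    }

    int main()
    {
        Tensor2D a{{4, 8}, std::vector<float>(32)};
        for(int i = 0; i < 32; ++i) a.data[i] = float(i);

        Window2D w{&a, {2, 2}, {0, 0}};
        auto t0 = load(w); // rows 0-1, cols 0-1
        w += {0, 2};       // slide along the K-like axis, as in the main loop
        auto t1 = load(w); // rows 0-1, cols 2-3
        std::printf("%g %g\n", t0[0], t1[0]); // prints: 0 2
        return 0;
    }

This per-window advance is what both the GEMM main loop (+= {0, kKPerBlock}) and the im2col copy loop below rely on to walk the K or GemmG axis without recomputing coordinates.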

example/91_tile_program/hello_world.cpp
@@ -19,6 +19,8 @@ struct HelloWorld
        auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
        auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));

        // only for testing purposes; the CPU should not do work here
        res[0] = desc0.GetLength(ck::Number<0>{});
        res[1] = desc1.GetLength(ck::Number<0>{});
    }
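
Both descriptors here are "packed": the strides are implied by the lengths, last dimension fastest. A small host-side sketch of that stride rule (packed_strides is a hypothetical helper; CK's real descriptors are compile-time constructs, not runtime arrays):

    #include <array>
    #include <cstdio>

    // packed strides: stride[i] = product of lengths[i+1 .. N)
    template <std::size_t N>
    std::array<long, N> packed_strides(const std::array<long, N>& lengths)
    {
        std::array<long, N> strides{};
        long running = 1;
        for(std::size_t i = N; i-- > 0;)
        {
            strides[i] = running;
            running *= lengths[i];
        }
        return strides;
    }

    int main()
    {
        std::array<long, 3> lengths{4, 5, 6};
        auto strides = packed_strides(lengths);
        std::printf("%ld %ld %ld\n", strides[0], strides[1], strides[2]); // 30 6 1
        return 0;
    }

GetLength(ck::Number<0>{}) then just reads back lengths[0]; the descriptors in this test are one-dimensional, so there is nothing else to query.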

example/91_tile_program/im2col.cpp (new file, mode 100644)
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"

#include <algorithm>
#include <array>
#include <functional>
#include <numeric>

// program
template <ck::index_t NDimSpatial, typename ALayout>
struct Im2Col
{
    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_dilations,
               const std::array<index_t, NDimSpatial>& input_left_pads,
               const std::array<index_t, NDimSpatial>& input_right_pads,
               //
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_lengths,
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_strides,
               //
               const T* p_a_img,
               T* p_a_mtx)
    {
        using namespace ck;

        const auto a_src_desc = tp(
            TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>::
                template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
                                                      a_g_n_c_wis_strides,
                                                      b_g_k_c_xs_lengths,
                                                      b_g_k_c_xs_strides,
                                                      c_g_n_k_wos_lengths,
                                                      c_g_n_k_wos_strides,
                                                      conv_filter_strides,
                                                      conv_filter_dilations,
                                                      input_left_pads,
                                                      input_right_pads));

        const auto a_dst_desc = tp(make_naive_tensor_descriptor(a_gemmg_gemmm_gemmk_lengths,
                                                                a_gemmg_gemmm_gemmk_strides));

        const auto a_src = make_tensor(a_src_desc, p_a_img);
        const auto a_dst = make_tensor(a_dst_desc, p_a_mtx);

        const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
        const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
        const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];

        const auto id_block = get_block_1d_id();

        const auto num_tile_m = num_gemmm / MPerTile;
        const auto num_tile_k = num_gemmk / KPerTile;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));

        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_k = id_tile.At<1>();

#if 0
        // operation-based syntax: per-operation solution strategy
        // operation here is data movement
        auto copier = make_copier(a_src,
                                  a_dst,
                                  make_tuple(1, MPerTile, KPerTile),
                                  make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                  copy_strategy);

        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copier();

            copier.move_src_window(make_tuple(1, 0, 0));
            copier.move_dst_window(make_tuple(1, 0, 0));
        }
#else
        // data-based syntax: per-data solution strategy
        auto window_a_src = make_window(a_src,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_src_window_map_strategy);

        auto window_a_dst = make_window(a_dst,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_dst_window_map_strategy);

        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copy(window_a_src, window_a_dst, a_copy_strategy);

            window_a_src += make_tuple(1, 0, 0);
            window_a_dst += make_tuple(1, 0, 0);
        }
#endif
    }
};

int main()
{
    // element type and A layout are placeholders chosen for this pseudo code
    using InDataType = float;
    using ALayout    = ck::tensor_layout::convolution::GNHWC;

    constexpr ck::index_t NumDimSpatial = 2;

    ck::index_t G  = 32;
    ck::index_t N  = 256;
    ck::index_t K  = 192;
    ck::index_t C  = 192;
    ck::index_t Y  = 3;
    ck::index_t X  = 3;
    ck::index_t Hi = 28;
    ck::index_t Wi = 28;
    ck::index_t Ho = 28;
    ck::index_t Wo = 28;

    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};

    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};

    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};

    // packed strides for GNHWC/GKYXC/GNHWK: suffix products of the lengths
    std::partial_sum(rbegin(in_lengths),
                     std::prev(rend(in_lengths)),
                     std::next(rbegin(in_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(wei_lengths),
                     std::prev(rend(wei_lengths)),
                     std::next(rbegin(wei_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(out_lengths),
                     std::prev(rend(out_lengths)),
                     std::next(rbegin(out_strides)),
                     std::multiplies<>{});

    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNKHW
    std::rotate(rbegin(in_lengths),
                std::next(rbegin(in_lengths)),
                std::next(rbegin(in_lengths), 3));
    std::rotate(rbegin(in_strides),
                std::next(rbegin(in_strides)),
                std::next(rbegin(in_strides), 3));

    std::rotate(rbegin(wei_lengths),
                std::next(rbegin(wei_lengths)),
                std::next(rbegin(wei_lengths), 3));
    std::rotate(rbegin(wei_strides),
                std::next(rbegin(wei_strides)),
                std::next(rbegin(wei_strides), 3));

    std::rotate(rbegin(out_lengths),
                std::next(rbegin(out_lengths)),
                std::next(rbegin(out_lengths), 3));
    std::rotate(rbegin(out_strides),
                std::next(rbegin(out_strides)),
                std::next(rbegin(out_strides), 3));

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    // matrix: (GemmG, GemmM, GemmK) view of the input image
    std::array<ck::index_t, 3> in_mtx_lengths{G, N * Ho * Wo, C * Y * X};
    std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};

    std::partial_sum(rbegin(in_mtx_lengths),
                     std::prev(rend(in_mtx_lengths)),
                     std::next(rbegin(in_mtx_strides)),
                     std::multiplies<>{});

    DeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    DeviceMem in_mtx(sizeof(InDataType) * G * N * Ho * Wo * C * Y * X);

    launch(Im2Col<NumDimSpatial, ALayout>{},
           1,
           1,
           in_lengths,
           in_strides,
           wei_lengths,
           wei_strides,
           out_lengths,
           out_strides,
           filter_strides,
           filter_dilations,
           input_left_pads,
           input_right_pads,
           //
           in_mtx_lengths,
           in_mtx_strides,
           //
           in.GetDeviceBuffer(),
           in_mtx.GetDeviceBuffer());

    return 0;
}
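
The descriptor built by MakeADescriptor_M_K encodes the classic im2col coordinate mapping: per group, GemmM enumerates (n, ho, wo), GemmK enumerates (y, x, c), and padding reads as zero. A host-only reference of that mapping for one group (the names, the NHWC layout, and the c-fastest GemmK order are assumptions of this sketch, not pinned down by the commit):

    #include <cstddef>
    #include <vector>

    // CPU im2col reference, one group, NHWC input:
    //   mtx[gemmm][gemmk] = img[n][hi][wi][c]
    //   gemmm = (n * Ho + ho) * Wo + wo
    //   gemmk = (y * X + x) * C + c
    //   hi = ho * stride_h + y * dilation_h - pad_h (zero-filled when outside)
    std::vector<float> im2col_nhwc(const std::vector<float>& img,
                                   int N, int Hi, int Wi, int C,
                                   int Y, int X, int Ho, int Wo,
                                   int stride_h, int stride_w,
                                   int dilation_h, int dilation_w,
                                   int pad_h, int pad_w)
    {
        std::vector<float> mtx(std::size_t(N) * Ho * Wo * C * Y * X, 0.f);
        for(int n = 0; n < N; ++n)
            for(int ho = 0; ho < Ho; ++ho)
                for(int wo = 0; wo < Wo; ++wo)
                    for(int y = 0; y < Y; ++y)
                        for(int x = 0; x < X; ++x)
                            for(int c = 0; c < C; ++c)
                            {
                                const int hi = ho * stride_h + y * dilation_h - pad_h;
                                const int wi = wo * stride_w + x * dilation_w - pad_w;
                                if(hi < 0 || hi >= Hi || wi < 0 || wi >= Wi)
                                    continue; // padding region stays zero
                                const std::size_t gemmm = (std::size_t(n) * Ho + ho) * Wo + wo;
                                const std::size_t gemmk = (std::size_t(y) * X + x) * C + c;
                                mtx[gemmm * (std::size_t(C) * Y * X) + gemmk] =
                                    img[((std::size_t(n) * Hi + hi) * Wi + wi) * C + c];
                            }
        return mtx;
    }

With Y = X = 3, pads of 1, and strides of 1 on a 28x28 input, this yields the (N * Ho * Wo) x (C * Y * X) matrix whose shape main() sets up in in_mtx_lengths; the device version performs the same walk tile by tile through windows instead of raw loops.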