gaoqiong / composable_kernel / Commits
"docs/git@developer.sourcefind.cn:change/sglang.git" did not exist on "743007e1ce07b99529b49d95413f4879853be1ac"
Commit 8ce14804, authored May 23, 2019 by Chao Liu

refactor

parent 1cc683a3
Showing 2 changed files with 49 additions and 108 deletions
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp  +4 -4
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp  +45 -104
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp (view file @ 8ce14804)
@@ -79,8 +79,8 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_N1_N2_C_B     = Sequence<1, 4, 1, 1>;
-    using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
+    using InBlockCopySubLengths_C_N1_B_N2     = Sequence<1, 1, 1, 4>;
+    using InBlockCopyClusterLengths_C_N1_B_N2 = Sequence<8, 2, 16, 1>;

     constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
     constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;

@@ -122,8 +122,8 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
         GemmKPerThreadLoop,
         GemmDataPerReadA,
         GemmDataPerReadB,
-        InBlockCopySubLengths_N1_N2_C_B,
-        InBlockCopyClusterLengths_N1_N2_C_B,
+        InBlockCopySubLengths_C_N1_B_N2,
+        InBlockCopyClusterLengths_C_N1_B_N2,
         InBlockCopySrcDataPerRead_B,
         InBlockCopyDstDataPerWrite_N2,
         WeiBlockCopySubLengths_C_K,
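Both the old and the new pair describe the same block tile: elementwise, sub-lengths times cluster-lengths give 8 x 2 x 16 x 4 in the new [C, N1, B, N2] order (and 2 x 4 x 8 x 16 in the old [N1, N2, C, B] order), with a cluster of 8 * 2 * 16 * 1 = 256 copying threads. Below is a minimal stand-in for ck's Sequence<> that checks this invariant outside the kernel; the tile reading (CPerBlock = 8, N1 = 2, BPerBlock = 16, N2 = 4) is inferred from the numbers above, not stated by the commit.

    // Illustrative sketch, not ck code: verify that per-thread sub-lengths times
    // thread-cluster lengths reproduce the [C, N1, B, N2] block tile.
    #include <cstddef>

    template <std::size_t... Is>
    struct Sequence
    {
    };

    // elementwise product of two equally long Sequences
    template <std::size_t... Ls, std::size_t... Rs>
    constexpr auto operator*(Sequence<Ls...>, Sequence<Rs...>)
    {
        return Sequence<(Ls * Rs)...>{};
    }

    // elementwise equality of two equally long Sequences
    template <std::size_t... Ls, std::size_t... Rs>
    constexpr bool operator==(Sequence<Ls...>, Sequence<Rs...>)
    {
        return ((Ls == Rs) && ...);
    }

    using InBlockCopySubLengths_C_N1_B_N2     = Sequence<1, 1, 1, 4>;  // per-thread slice
    using InBlockCopyClusterLengths_C_N1_B_N2 = Sequence<8, 2, 16, 1>; // thread grid

    // 8 * 2 * 16 * 1 = 256 threads, each moving a 1 x 1 x 1 x 4 sub-slice
    static_assert(InBlockCopySubLengths_C_N1_B_N2{} * InBlockCopyClusterLengths_C_N1_B_N2{} ==
                      Sequence<8, 2, 16, 4>{}, // assumed [CPerBlock, N1, BPerBlock, N2] tile
                  "sub-lengths x cluster-lengths must cover the [C, N1, B, N2] block tile");

    int main() { return 0; }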
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp (view file @ 8ce14804)
@@ -28,8 +28,8 @@ template <index_t GridSize,
           index_t GemmKPerThreadLoop,
           index_t GemmDataPerReadA,
           index_t GemmDataPerReadB,
-          class InBlockCopySubLengths_N1_N2_C_B,
-          class InBlockCopyClusterLengths_N1_N2_C_B,
+          class InBlockCopySubLengths_C_N1_B_N2,
+          class InBlockCopyClusterLengths_C_N1_B_N2,
           index_t InBlockCopySrcDataPerRead_B,
           index_t InBlockCopyDstDataPerWrite_N2,
           class WeiBlockCopySubLengths_C_K,
@@ -101,26 +101,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         constexpr auto in_n0_n1_n2_c_h_w_global_mem_desc =
             in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{});

-        // merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy
-        constexpr auto in_n1_n2_c_b_global_merged_desc = make_ConstantMergedTensorDescriptor(
-            in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
-                .Slice(I4, Number<Ho>{})
-                .Slice(I5, Number<Wo>{}),
-            Sequence<0>{},
-            Sequence<1>{},
-            Sequence<2>{},
-            Sequence<3, 4, 5>{});
+        // merged tensor descriptor in device memory [C, N1, B, N2], src of blockwise copy
+        constexpr auto in_c_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
+            in_n0_n1_n2_c_h_w_global_mem_desc.Slice(I4, Number<Ho>{}).Slice(I5, Number<Wo>{}),
+            Sequence<3>{},
+            Sequence<1>{},
+            Sequence<0, 4, 5>{},
+            Sequence<2>{});

-        // memory layout descriptor in LDS [C, N1, B, N2]
+        // memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto in_c_n1_b_n2_block_mem_desc =
             make_ConstantTensorDescriptor_default_rank_aligned(
                 Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

-        // tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy
-        constexpr auto in_n1_n2_c_b_block_desc =
-            in_c_n1_b_n2_block_mem_desc.ReorderGivenNew2Old(Sequence<1, 3, 0, 2>{});
-
         // this check is ad-hoc
         // TODO: need to properly implement tensor descriptor with alignment
         static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0,
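The new in_c_n1_b_n2_global_merged_desc drops the ReorderGivenNew2Old step: make_ConstantMergedTensorDescriptor now takes the sliced [N0, N1, N2, C, Ho, Wo] descriptor directly and names, per merged output dimension, which source dimensions feed it, with only B (Sequence<0, 4, 5>, i.e. B = N0 * Ho * Wo) spanning more than one. Those Sequence<> arguments must form a partition of the six source dimensions; here is a standalone sketch of that invariant, using my own minimal Sequence<> stand-in rather than ck's.

    // Illustrative sketch, not ck code: the merged-dimension selectors must use
    // every source dimension of [N0, N1, N2, C, Ho, Wo] exactly once.
    #include <cstddef>

    template <std::size_t... Is>
    struct Sequence
    {
    };

    // bitmask of the source dimensions named by a Sequence
    template <std::size_t... Is>
    constexpr unsigned mask(Sequence<Is...>)
    {
        return ((1u << Is) | ... | 0u);
    }

    using C_dims  = Sequence<3>;       // C
    using N1_dims = Sequence<1>;       // N1
    using B_dims  = Sequence<0, 4, 5>; // B = N0 * Ho * Wo
    using N2_dims = Sequence<2>;       // N2

    // OR covers all six dims; the sum equals the OR only if the masks are disjoint
    static_assert((mask(C_dims{}) | mask(N1_dims{}) | mask(B_dims{}) | mask(N2_dims{})) ==
                      0b111111u,
                  "merged dims must cover [N0, N1, N2, C, Ho, Wo]");
    static_assert(mask(C_dims{}) + mask(N1_dims{}) + mask(B_dims{}) + mask(N2_dims{}) ==
                      0b111111u,
                  "merged dims must not overlap");

    int main() { return 0; }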
@@ -132,16 +126,16 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         const auto blockwise_in_copy = BlockwiseTensorSliceCopy_generic_v1<
             BlockSize,
             Float,
-            decltype(in_n1_n2_c_b_global_merged_desc),
-            decltype(in_n1_n2_c_b_block_desc),
-            decltype(in_n1_n2_c_b_block_desc.GetLengths()),
-            InBlockCopySubLengths_N1_N2_C_B,
-            InBlockCopyClusterLengths_N1_N2_C_B,
-            Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B]
-            Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B]
-            Sequence<2, 0, 3, 1>, // dst_access_order [C, N1, B, N2]
+            decltype(in_c_n1_b_n2_global_merged_desc),
+            decltype(in_c_n1_b_n2_block_mem_desc),
+            decltype(in_c_n1_b_n2_block_mem_desc.GetLengths()),
+            InBlockCopySubLengths_C_N1_B_N2,
+            InBlockCopyClusterLengths_C_N1_B_N2,
+            Sequence<0, 1, 3, 2>, // thread_arrange_order [C, N1, N2, B]
+            Sequence<1, 3, 0, 2>, // src_access_order [N1, N2, C, B]
+            Sequence<0, 1, 2, 3>, // dst_access_order [C, N1, B, N2]
             InBlockCopySrcDataPerRead_B,
-            InBlockCopyDstDataPerWrite_N2>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
+            InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

         // weight tensor
         // tensor descriptor in device memory, src of blockwise copy
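The Sequence<> orders in this template argument list are plain permutations of the four tile dimensions, rewritten in lockstep with the new [C, N1, B, N2] storage so that each trailing comment stays true; in particular, the dst access order collapses to the identity once the LDS tile is itself stored as [C, N1, B, N2]. A standalone check (not ck code) that the old dst_access_order really visited the old layout in the new order:

    // Illustrative sketch: applying the old dst_access_order <2, 0, 3, 1> to the
    // old [N1, N2, C, B] descriptor visits dimensions as [C, N1, B, N2].
    #include <array>

    enum Dim { N1, N2, C, B };

    constexpr std::array<Dim, 4> apply_order(std::array<Dim, 4> dims, std::array<int, 4> order)
    {
        // visit dims[order[0]] first, then dims[order[1]], and so on
        return {dims[order[0]], dims[order[1]], dims[order[2]], dims[order[3]]};
    }

    constexpr std::array<Dim, 4> old_layout{N1, N2, C, B};
    constexpr std::array<Dim, 4> reordered = apply_order(old_layout, {2, 0, 3, 1});

    static_assert(reordered[0] == C && reordered[1] == N1 && reordered[2] == B &&
                      reordered[3] == N2,
                  "dst_access_order <2, 0, 3, 1> on [N1, N2, C, B] visits [C, N1, B, N2]");

    int main() { return 0; }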
@@ -154,7 +148,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

         // operator for blockwise copy of weight into LDS
-        // slicing a tensor
+        // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
         const auto blockwise_wei_copy =
             BlockwiseTensorSliceCopy_generic_v1<BlockSize,
@@ -252,16 +246,12 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
              p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
              p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
         {
-#if 1 // debug
             blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
-#endif

             __syncthreads();

-#if 1 // debug
             blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-#endif

             __syncthreads();
         }
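The deleted "#if 1 // debug" guards were always-true scaffolding around the copy and GEMM calls; what the loop actually depends on is the barrier placement. A sketch of that structure, with placeholder comments standing in for ck's operators (illustrative, not the kernel's code):

    // Illustrative sketch of the main-loop barrier structure.
    __device__ void lds_pipeline_iteration()
    {
        // stage 1: all threads cooperatively copy the next input and weight
        // tiles from global memory into LDS
        //   blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
        //   blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);

        __syncthreads(); // tiles must be fully written before any thread reads

        // stage 2: every thread accumulates its sub-tile product from LDS into
        // its register accumulator
        //   blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);

        __syncthreads(); // reads must finish before the next iteration overwrites LDS
    }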
@@ -275,104 +265,55 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         constexpr index_t K0 = K / (K1 * K2);

         // define tensor descriptor for threadwise copy

-        // output tensor (also, memory layout) descriptor in register, src of threadwise
-        // copy
-        constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(
-                Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
-
-        // output tensor descriptor in register, src of threadwise copy
-        constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_thread_desc =
-            out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc.ReorderGivenNew2Old(
-                Sequence<4, 3, 7, 0, 1, 2, 5, 6>{});
+        // output memory layout descriptor in register
+        constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc =
+            make_ConstantTensorDescriptor_default_rank_packed(
+                Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, N2>{});

-        // output memory layout descriptor in device memory
+        // output memory layout descriptor in device memory, dst of threadwise copy
         constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc =
             out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}, Number<K2>{})
                 .Fold(I0, Number<N1>{}, Number<N2>{});

+        // output merged tensor descriptor in device memory, dst of threadwise copy
+        constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc =
+            make_ConstantMergedTensorDescriptor(
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.ReorderGivenNew2Old(
+                    Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}),
+                Sequence<0>{},
+                Sequence<1>{},
+                Sequence<2>{},
+                Sequence<3>{},
+                Sequence<4, 5, 6>{},
+                Sequence<7>{});
+
         // calculate origin of thread output tensor on global memory
         // blockwise GEMM c matrix starting index
         const auto c_thread_mtx_on_block =
             blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

+        // origin of thread tensor on global
         const index_t k_thread_data_on_global =
             k_block_data_on_global + c_thread_mtx_on_block.row;

         const index_t b_thread_data_on_global =
             b_block_data_on_global + c_thread_mtx_on_block.col / N2;

         // output merged global tensor descriptor, for calculating origin of thread tensor
         // in global memory
-        constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-            out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.Unfold(I3, I5),
-            Sequence<3>{},
-            Sequence<1>{},
-            Sequence<0, 4, 5>{},
-            Sequence<2>{});
+#if 0 // unfold a merged tensor is not implemented yet
+        constexpr auto out_k_n1_b_n2_global_merged_desc =
+            out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2);
+#else
+        constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
+            out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
+                .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
+                .Unfold(I0, I2),
+            Sequence<0>{},
+            Sequence<1>{},
+            Sequence<2, 3, 4>{},
+            Sequence<5>{});
+#endif

-        // origin of thread tensor in global memory
-        Float* p_out_thread_on_global =
-            p_out_global + out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
-                               k_thread_data_on_global, 0, b_thread_data_on_global, 0);
+        // origin of dst in device memory
+        Float* p_out_thread_on_global =
+            p_out_global + out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
+                               k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor

-        threadwise_tensor_slice_copy_generic(
-            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-            p_out_thread,
-            {0, 0, 0, 0, 0, 0, 0, 0},
-            out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-            p_out_thread_on_global,
-            {0, 0, 0, 0, 0, 0, 0, 0},
-            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-            arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+        threadwise_tensor_slice_copy_generic(
+            out_k0_k1_k2_n1_b_n2_thread_mem_desc, // src thread tensor (in register) descriptor
+            p_out_thread,                         // origin of src
+            {0, 0, 0, 0, 0, 0},                   // starting point of slice, w.r.t. origin of src
+            out_k0_k1_k2_n1_b_n2_global_merged_desc, // dst global merged tensor (in device mem)
+                                                     // descriptor
+            p_out_thread_on_global,                  // origin of dst
+            {0, 0, 0, 0, b_thread_data_on_global, 0}, // starting point of slice w.r.t. origin of dst
+            out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths
+            Sequence<3, 5, 0, 1, 2, 4>{} // dimension access order [n1, n2, k0, k1, k2, b]
+        );

 #if 0
         if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
         {
             print_ConstantTensorDescriptor(in_n0_n1_n2_c_h_w_global_mem_desc,
                                            "in_n0_n1_n2_c_h_w_global_mem_desc");
             print_ConstantMergedTensorDescriptor(in_n1_n2_c_b_global_merged_desc,
                                                  "in_n1_n2_c_b_global_merged_desc");
             print_ConstantTensorDescriptor(in_c_n1_b_n2_block_mem_desc,
                                            "in_c_n1_b_n2_block_mem_desc");
             print_ConstantTensorDescriptor(in_n1_n2_c_b_block_desc, "in_n1_n2_c_b_block_desc");
             print_ConstantTensorDescriptor(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
                                            "out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc");
             print_ConstantMergedTensorDescriptor(out_k_n1_b_n2_global_merged_desc,
                                                  "out_k_n1_b_n2_global_merged_desc");
             print_ConstantTensorDescriptor(out_k0_k1_k2_n1_b_n2_thread_mem_desc,
                                            "out_k0_k1_k2_n1_b_n2_thread_mem_desc");
         }
 #endif
         }
     }
 };
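The last two changes move the B coordinate out of the pointer arithmetic and into the copy itself: the old code baked both k_thread_data_on_global and b_thread_data_on_global into p_out_thread_on_global and copied with all-zero origins, while the new code keeps only the K offset in the pointer and passes b as the dst slice origin on the merged B dimension. The two factorings land on the same element because offset contributions from distinct dimensions add, even when B = N0 * Ho * Wo is merged and internally nonlinear. A plain-C++ model with made-up lengths and strides (not the kernel's real geometry):

    // Illustrative sketch: pointer-carried offsets and copy-origin offsets are
    // interchangeable across distinct dimensions of a [K, N1, B, N2] view.
    #include <cassert>

    const long stride_k = 64, stride_n1 = 4096, stride_n2 = 1; // assumed strides
    const long b_len[3]    = {2, 4, 4};     // merged B decomposes into (N0, Ho, Wo)
    const long b_stride[3] = {8192, 16, 4}; // assumed strides of (N0, Ho, Wo)

    // nonlinear merged-dimension offset: decompose b, then apply each stride
    long offset_b(long b)
    {
        long off = 0;
        for(int i = 2; i >= 0; --i)
        {
            off += (b % b_len[i]) * b_stride[i];
            b /= b_len[i];
        }
        return off;
    }

    long offset(long k, long n1, long b, long n2)
    {
        return k * stride_k + n1 * stride_n1 + offset_b(b) + n2 * stride_n2;
    }

    int main()
    {
        const long k = 5, b = 13;
        // old style: pointer carries k and b, copy origin is all zeros
        const long old_style = offset(k, 0, b, 0);
        // new style: pointer carries k only, copy origin carries b on merged B
        const long new_style = offset(k, 0, 0, 0) + offset(0, 0, b, 0);
        assert(old_style == new_style); // same destination element
        return 0;
    }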