yangql / composable_kernel-1 · Commits

Commit 00089cd6
Authored Sep 26, 2019 by Chao Liu
Parent: b12bbcee

clean up
Showing 8 changed files with 18 additions and 263 deletions
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp (+2 −8)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp (+1 −4)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp (+2 −8)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp (+1 −4)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+4 −4)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+4 −4)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp (+2 −229)
driver/src/driver.cpp (+2 −2)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -396,14 +396,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                            0,
                            b_thread_data_on_global,
                            0})
-#if 0
-                .Run
-#else // tweaking
-                .template Run_optimized_address_calculation<Float,
-                                                            address_space_t::generic,
-                                                            address_space_t::global>
-#endif
-                (p_out_thread, p_out_global);
+                .template Run<Float, address_space_t::generic, address_space_t::global>(
+                    p_out_thread, p_out_global);
         }
     }
 };
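Note on the surviving call: Run is a member function template invoked through an object whose type depends on template parameters, so the call site needs the .template disambiguator. A minimal stand-alone sketch of that C++ rule (the Copier and copy_through names below are illustrative, not from this repository):

#include <cstdio>

enum class address_space_t { generic, global };

template <int N>
struct Copier
{
    // Member function template, analogous to the slice-copy Run<...> in the diff above.
    template <typename Float, address_space_t Src, address_space_t Dst>
    void Run(const Float* p_src, Float* p_dst) const
    {
        for(int i = 0; i < N; ++i)
            p_dst[i] = p_src[i];
    }
};

template <int N>
void copy_through(const Copier<N>& copier, const float* p_src, float* p_dst)
{
    // "copier" has a dependent type here, so ".template" tells the parser that
    // Run<...> names a template rather than a less-than comparison.
    copier.template Run<float, address_space_t::generic, address_space_t::global>(p_src, p_dst);
}

int main()
{
    float src[4] = {1, 2, 3, 4};
    float dst[4] = {};
    copy_through(Copier<4>{}, src, dst);
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}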
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -427,10 +427,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                            b_thread_data_on_global,
                            0})
 #if 1
-                .template Run_generic<Float,
-                                      Float,
-                                      address_space_t::generic,
-                                      address_space_t::global>
+                .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                 .template Run_optimized_dst_address_calculation<Float,
                                                                 Float,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -391,14 +391,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
         {
             threadwise_out_copy
-#if 1
-                .Run
-#else // tweaking
-                .template Run_optimized_address_calculation<Float,
-                                                            address_space_t::generic,
-                                                            address_space_t::global>
-#endif
-                (p_out_thread, p_out_global);
+                .template Run<Float, address_space_t::generic, address_space_t::global>(
+                    p_out_thread, p_out_global);

             threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
             threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -390,10 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                            b_thread_data_on_global / B1,
                            b_thread_data_on_global % B1})
 #if 1
-                .template Run_generic<Float,
-                                      Float,
-                                      address_space_t::generic,
-                                      address_space_t::global>
+                .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                 .template Run_optimized_dst_address_calculation<Float,
                                                                 Float,
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -73,8 +73,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                         ThreadBufferData* p_thread_buffer) const
     {
-#if 1
-        mThreadwiseLoad.template Run_generic<BlockSrcData,
+#if 0
+        mThreadwiseLoad.template Run<BlockSrcData,
                                              ThreadBufferData,
                                              BlockSrcAddressSpace,
                                              ThreadBufferAddressSpace>(p_block_src,

@@ -95,8 +95,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                          BlockDstData* p_block_dst) const
     {
-#if 1
-        mThreadwiseStore.template Run_generic<ThreadBufferData,
+#if 0
+        mThreadwiseStore.template Run<ThreadBufferData,
                                               BlockDstData,
                                               ThreadBufferAddressSpace,
                                               BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
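For context, RunLoadThreadBuffer and RunStoreThreadBuffer are the two halves of a copy that stages data in a per-thread buffer between the block source and the block destination. A simplified host-side sketch of that two-phase pattern (BlockwiseCopy, ThreadwiseCopy, and the plain loops are illustrative stand-ins, not the library's implementation):

#include <array>
#include <cstdio>

// Illustrative stand-in for a threadwise slice copy: copies N contiguous elements.
template <int N>
struct ThreadwiseCopy
{
    template <typename SrcData, typename DstData>
    void Run(const SrcData* p_src, DstData* p_dst) const
    {
        for(int i = 0; i < N; ++i)
            p_dst[i] = static_cast<DstData>(p_src[i]);
    }
};

// Illustrative stand-in for the blockwise copy: it stages data through a small
// per-thread buffer in two phases, each delegated to a threadwise copy.
template <int N>
struct BlockwiseCopy
{
    ThreadwiseCopy<N> mThreadwiseLoad;
    ThreadwiseCopy<N> mThreadwiseStore;

    template <typename BlockSrcData, typename ThreadBufferData>
    void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const
    {
        mThreadwiseLoad.template Run<BlockSrcData, ThreadBufferData>(p_block_src, p_thread_buffer);
    }

    template <typename ThreadBufferData, typename BlockDstData>
    void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const
    {
        mThreadwiseStore.template Run<ThreadBufferData, BlockDstData>(p_thread_buffer, p_block_dst);
    }
};

int main()
{
    std::array<float, 4> src{1, 2, 3, 4}, buf{}, dst{};
    BlockwiseCopy<4> copy{};
    copy.RunLoadThreadBuffer(src.data(), buf.data());  // block source -> thread buffer
    copy.RunStoreThreadBuffer(buf.data(), dst.data()); // thread buffer -> block destination
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}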
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -78,7 +78,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
               typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                    __buffer_load<SrcData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
+                    __buffer_load<SrcData, SrcDataPerAccess>(p_src, 0, src_coord.GetOffset());
 #else
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);

@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 __buffer_store<DstData, DstDataPerAccess>(
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                     p_dst,
-                    dst_coord.GetOffset(),
-                    0);
+                    0,
+                    dst_coord.GetOffset());
 #else
                 *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
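Besides renaming Run_generic to Run, the last two hunks swap the order of the two offset arguments passed to __buffer_load and __buffer_store; this sketch does not attempt to reproduce those intrinsics. The #else branches show the portable fallback, which moves a whole vector per access through a reinterpret_cast to a vector type. A small host-side sketch of that fallback pattern (the vector_type stand-in below is a simplification, not the library's definition):

#include <cstdio>

// Simplified stand-in for vector_type<T, N>::MemoryType: a packed N-wide chunk.
template <typename T, int N>
struct vector_type
{
    struct MemoryType { T data[N]; };
};

int main()
{
    using src_vector_t = vector_type<float, 4>::MemoryType;

    float p_src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float p_dst[8] = {};
    const int src_offset = 4; // e.g. src_coord.GetOffset()
    const int dst_offset = 0;

    // One assignment moves SrcDataPerAccess (= 4 here) elements at a time,
    // mirroring the #else branch of the diff above.
    *reinterpret_cast<src_vector_t*>(&p_dst[dst_offset]) =
        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);

    std::printf("%g %g %g %g\n", p_dst[0], p_dst[1], p_dst[2], p_dst[3]);
    return 0;
}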
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp

@@ -538,235 +538,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
         }
     };
-    template <typename TData>
-    __device__ void Run(const TData* p_src, TData* p_dst) const
-    {
-        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
-        TData p_buffer_[buffer_desc.GetElementSpace()];
-        TData* p_buffer = p_buffer_;
-        // copy data from src into buffer
-        {
-            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
-            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
-            constexpr auto src_access_lengths = SliceLengths::Modify(
-                src_vector_access_dim,
-                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
-            // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
-            // normal dimensions is known at compile time.
-            // Below is a hack to isolate merged dimension id from normal dimension id, so the
-            // corresponding offset can be calculated seperately at run-time and compile-time.
-            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
-            // merged dimensions, and has value = 1 on normal dimensions;
-            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
-            // normal dimensions, and has value = 1 on merged dimensions;
-            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
-                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
-            constexpr auto src_normal_dim_access_lengths =
-                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
-#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
-            // offset w.r.t. merged dimension need to be computed at run-time
-            static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
-                [&](auto src_merged_dim_access_id_) {
-                    constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
-                    constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
-                        src_vector_access_dim,
-                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
-                    const TData* p_src_tmp =
-                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
-                    // offset w.r.t. normal dimension can be computed at compile-time
-                    static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
-                        [&](auto src_normal_dim_access_id_) {
-                            constexpr auto src_normal_dim_access_id =
-                                decltype(src_normal_dim_access_id_){};
-                            constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
-                                src_vector_access_dim,
-                                src_normal_dim_access_id[src_vector_access_dim] *
-                                    src_data_per_access);
-                            constexpr index_t src_normal_offset =
-                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
-                            // load vector from src
-                            const src_vector_t vector_data =
-                                *reinterpret_cast<const src_vector_t*>(
-                                    &p_src_tmp[src_normal_offset]);
-                            // unpack vector into buffer
-                            static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
-                                constexpr auto scalar_id =
-                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
-                                        src_vector_access_dim, i);
-                                constexpr index_t buffer_offset =
-                                    buffer_desc.GetOffsetFromMultiIndex(
-                                        src_merged_dim_data_id + src_normal_dim_data_id +
-                                        scalar_id);
-                                p_buffer[buffer_offset] =
-                                    reinterpret_cast<const TData*>(&vector_data)[i];
-                            });
-                        });
-                });
-#else
-            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
-                [&](auto src_merged_dim_access_id) {
-                    auto src_merged_dim_data_id = src_merged_dim_access_id;
-                    src_merged_dim_data_id(src_vector_access_dim) =
-                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
-                    const TData* p_src_tmp =
-                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
-                    // these should be compile-time known
-                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
-                        [&](auto src_normal_dim_access_id) {
-                            auto src_normal_dim_data_id = src_normal_dim_access_id;
-                            src_normal_dim_data_id(src_vector_access_dim) =
-                                src_normal_dim_access_id[src_vector_access_dim] *
-                                src_data_per_access;
-                            const index_t src_normal_offset =
-                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
-                            // load vector from src
-                            const src_vector_t vector_data =
-                                *reinterpret_cast<const src_vector_t*>(
-                                    &p_src_tmp[src_normal_offset]);
-                            // unpack vector into buffer
-                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
-                            {
-                                auto scalar_id                   = make_zero_array<index_t, nDim>();
-                                scalar_id(src_vector_access_dim) = i;
-                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
-                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
-                                p_buffer[buffer_offset] =
-                                    reinterpret_cast<const TData*>(&vector_data)[i];
-                            }
-                        });
-                });
-#endif
-        }
-        // copy data from buffer into dst
-        {
-            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
-            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
-            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
-            constexpr auto dst_access_lengths = SliceLengths::Modify(
-                dst_vector_access_dim,
-                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
-            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
-                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
-            constexpr auto dst_normal_dim_access_lengths =
-                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
-#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
-            // offset w.r.t. merged dimension need to be computed at run-time
-            static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
-                [&](auto dst_merged_dim_access_id_) {
-                    constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
-                    constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
-                        dst_vector_access_dim,
-                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
-                    TData* p_dst_tmp =
-                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
-                    // offset w.r.t. normal dimension can be computed at compile-time
-                    static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
-                        [&](auto dst_normal_dim_access_id_) {
-                            constexpr auto dst_normal_dim_access_id =
-                                decltype(dst_normal_dim_access_id_){};
-                            constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
-                                dst_vector_access_dim,
-                                dst_normal_dim_access_id[dst_vector_access_dim] *
-                                    dst_data_per_access);
-                            dst_vector_t vector_data;
-                            // pack vector from buffer
-                            static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
-                                constexpr auto scalar_id =
-                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
-                                        dst_vector_access_dim, i);
-                                constexpr index_t buffer_offset =
-                                    buffer_desc.GetOffsetFromMultiIndex(
-                                        dst_merged_dim_data_id + dst_normal_dim_data_id +
-                                        scalar_id);
-                                reinterpret_cast<TData*>(&vector_data)[i] =
-                                    p_buffer[buffer_offset];
-                            });
-                            constexpr index_t dst_normal_offset =
-                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
-                            // write vector into dst
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
-                                vector_data;
-                        });
-                });
-#else
-            // offset w.r.t. merged dimension need to be computed at run-time
-            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
-                [&](auto dst_merged_dim_access_id) {
-                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
-                    dst_merged_dim_data_id(dst_vector_access_dim) =
-                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
-                    TData* p_dst_tmp =
-                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
-                    // offset w.r.t. normal dimension can be computed at compile-time
-                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
-                        [&](auto dst_normal_dim_access_id) {
-                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
-                            dst_normal_dim_data_id(dst_vector_access_dim) =
-                                dst_normal_dim_access_id[dst_vector_access_dim] *
-                                dst_data_per_access;
-                            dst_vector_t vector_data;
-                            // pack vector from buffer
-                            for(index_t i = 0; i < DstDataPerAccess; ++i)
-                            {
-                                auto scalar_id                   = make_zero_array<index_t, nDim>();
-                                scalar_id(dst_vector_access_dim) = i;
-                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
-                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
-                                reinterpret_cast<TData*>(&vector_data)[i] =
-                                    p_buffer[buffer_offset];
-                            }
-                            const index_t dst_normal_offset =
-                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
-                            // write vector into dst
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
-                                vector_data;
-                        });
-                });
-#endif
-        }
-    }
     template <typename TData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
@@ -841,9 +616,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 vector_data = __buffer_load<TData, SrcDataPerAccess>(
-                    p_src,
-                    static_cast<uint32_t>(src_merged_offset),
-                    static_cast<uint32_t>(src_normal_offset));
+                    p_src, src_merged_offset, src_normal_offset);
 #else
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
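Most of the deletion in this file is the old generic Run of the deprecated v2r1 copy, which staged data through a thread-local buffer and split every address into a run-time part (from "merged" dimensions) plus a compile-time part (from "normal" dimensions), as the comments in the removed code explain. A simplified sketch of that offset decomposition (the 3-D strides and the choice of which dimension counts as "merged" are made-up example values, not the library's descriptors):

#include <array>
#include <cstdio>

// Offset of a multi-index in a packed (row-major) 3-D tensor.
constexpr int offset(const std::array<int, 3>& strides, const std::array<int, 3>& idx)
{
    return strides[0] * idx[0] + strides[1] * idx[1] + strides[2] * idx[2];
}

int main()
{
    constexpr std::array<int, 3> strides = {20, 5, 1}; // lengths {4, 4, 5}, packed

    // Pretend dimension 0 is "merged" (index only known at run time) and
    // dimensions 1 and 2 are "normal" (indices fixed at compile time).
    int merged_idx0 = 3; // run-time value
    constexpr std::array<int, 3> normal_id = {0, 2, 4};

    // Compile-time part of the offset (normal dimensions only).
    constexpr int normal_offset = offset(strides, normal_id);

    // Run-time part of the offset (merged dimensions only).
    const int merged_offset = offset(strides, {merged_idx0, 0, 0});

    // Because the two index groups are disjoint, the full offset is the sum of
    // the two parts, as in the removed code's p_src[src_normal_offset + src_merged_offset].
    std::printf("offset = %d (= %d compile-time + %d run-time)\n",
                normal_offset + merged_offset, normal_offset, merged_offset);
    return 0;
}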
driver/src/driver.cpp

@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                          ConvStrides{},
                                                          ConvDilations{},
                                                          nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,

@@ -490,7 +490,7 @@ int main(int argc, char* argv[])
                                                          ConvStrides{},
                                                          ConvDilations{},
                                                          nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
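driver.cpp picks which convolution path to run through a chain of preprocessor conditionals; this commit enables the v4r1 padded call and disables the v4r4 padded call by flipping the literals in the two #elif lines. A stripped-down sketch of that selection pattern (run_v4r1_padded and run_v4r4_padded are placeholders, not the driver's real entry points):

#include <cstdio>

void run_v4r1_padded() { std::printf("v4r1 padded convolution\n"); }
void run_v4r4_padded() { std::printf("v4r4 padded convolution\n"); }

int main()
{
// Exactly one branch is compiled in; editing the 0/1 literals below is how
// driver.cpp switches between implementations.
#if 0
    // some other implementation
#elif 1
    run_v4r1_padded();
#elif 0
    run_v4r4_padded();
#endif
    return 0;
}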