Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b808713a
Commit
b808713a
authored
Aug 06, 2020
by
Chao Liu
Browse files
prototype dynamic descriptor
parent
d3405258
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
23 additions
and
320 deletions
+23
-320
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
...rnel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+15
-90
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+6
-205
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
...e/tensor_description/dynamic_tensor_descriptor_helper.hpp
+0
-15
driver/include/device_dummy_dynamic_transform.hpp
driver/include/device_dummy_dynamic_transform.hpp
+2
-10
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
View file @
b808713a
...
@@ -12,73 +12,6 @@ namespace ck {
...
@@ -12,73 +12,6 @@ namespace ck {
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform
struct
DummyDynamicTransform
{
{
__device__
void
Run_
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
,
index_t
k_block_num
,
index_t
c_block_num
,
index_t
y_block_num
,
index_t
x_block_num
)
const
{
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
const
index_t
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
const
index_t
K
=
out_n_k_ho_wo_global_desc
.
GetLength
(
1
);
const
index_t
Ho
=
out_n_k_ho_wo_global_desc
.
GetLength
(
2
);
const
index_t
Wo
=
out_n_k_ho_wo_global_desc
.
GetLength
(
3
);
const
index_t
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
2
);
const
index_t
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
3
);
const
index_t
ConvStrideH
=
conv_strides
[
0
];
const
index_t
ConvStrideW
=
conv_strides
[
1
];
const
index_t
ConvDilationH
=
conv_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_dilations
[
1
];
p_wei_global
[
0
]
=
wei_k_c_y_x_global_desc
.
GetElementSize
();
p_wei_global
[
1
]
=
wei_k_c_y_x_global_desc
.
GetElementSpace
();
const
index_t
k_block_num_stride
=
c_block_num
*
y_block_num
*
x_block_num
;
const
index_t
c_block_num_stride
=
y_block_num
*
x_block_num
;
const
index_t
y_block_num_stride
=
x_block_num
;
index_t
tmp
=
get_block_1d_id
();
#if 0
const index_t k_block = tmp / k_block_num_stride;
tmp -= k_block * k_block_num_stride;
const index_t c_block = tmp / c_block_num_stride;
tmp -= c_block * c_block_num_stride;
const index_t y_block = tmp / y_block_num_stride;
tmp -= y_block * y_block_num_stride;
const index_t x_block = tmp;
#else
const
index_t
k_block
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
/
k_block_num_stride
);
tmp
-=
k_block
*
k_block_num_stride
;
const
index_t
c_block
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
/
c_block_num_stride
);
tmp
-=
c_block
*
c_block_num_stride
;
const
index_t
y_block
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
/
y_block_num_stride
);
tmp
-=
y_block
*
y_block_num_stride
;
const
index_t
x_block
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
);
#endif
const
index_t
k_thread
=
p_in_global
[
get_thread_local_1d_id
()];
const
index_t
c_thread
=
p_in_global
[
get_thread_local_1d_id
()
+
1
];
const
index_t
y_thread
=
p_in_global
[
get_thread_local_1d_id
()
+
2
];
const
index_t
x_thread
=
p_in_global
[
get_thread_local_1d_id
()
+
3
];
p_wei_global
[
3
]
=
wei_k_c_y_x_global_desc
.
CalculateOffset
(
{
k_block
+
k_thread
,
c_block
+
c_thread
,
y_block
+
y_thread
,
x_block
+
x_thread
});
}
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
...
@@ -88,11 +21,7 @@ struct DummyDynamicTransform
...
@@ -88,11 +21,7 @@ struct DummyDynamicTransform
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
index_t
,
index_t
,
index_t
,
index_t
)
const
{
{
#if 1
#if 1
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
...
@@ -189,7 +118,7 @@ struct DummyDynamicTransform
...
@@ -189,7 +118,7 @@ struct DummyDynamicTransform
const
index_t
&
idx_low_bound
)
{
const
index_t
&
idx_low_bound
)
{
index_t
idx_low_tmp
=
idx_low_old
+
carry
+
idx_low_diff_const
;
index_t
idx_low_tmp
=
idx_low_old
+
carry
+
idx_low_diff_const
;
#if
0
// positive
#if
1
// positive
bool
do_carry
=
idx_low_tmp
>=
idx_low_bound
;
bool
do_carry
=
idx_low_tmp
>=
idx_low_bound
;
index_t
idx_low_new
=
do_carry
?
idx_low_tmp
-
idx_low_bound
:
idx_low_tmp
;
index_t
idx_low_new
=
do_carry
?
idx_low_tmp
-
idx_low_bound
:
idx_low_tmp
;
...
@@ -389,23 +318,6 @@ struct DummyDynamicTransform
...
@@ -389,23 +318,6 @@ struct DummyDynamicTransform
Y
,
Y
,
X
);
X
);
#if 0
// Merge(N, Ho, Wo) => GemmN
f_lower_idx_diff_merge(idx_diff[9],
idx_diff[12],
idx_diff[14],
idx_diff[16],
idx[9],
idx[12],
idx[14],
const_tmp[3],
const_tmp[4],
const_tmp[5],
N,
Ho,
Wo);
#endif
// stage 2
// stage 2
// PassThrough(N) => N
// PassThrough(N) => N
f_lower_idx_diff_passthrough
(
idx_diff
[
5
],
idx_diff
[
9
]);
f_lower_idx_diff_passthrough
(
idx_diff
[
5
],
idx_diff
[
9
]);
...
@@ -496,6 +408,19 @@ struct DummyDynamicTransform
...
@@ -496,6 +408,19 @@ struct DummyDynamicTransform
out_n_k_ho_wo_global_desc
.
GetElementSpace
());
out_n_k_ho_wo_global_desc
.
GetElementSpace
());
}
}
}
}
__device__
void
Run_
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
}
};
};
}
// namespace ck
}
// namespace ck
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
b808713a
...
@@ -5,217 +5,18 @@
...
@@ -5,217 +5,18 @@
namespace
ck
{
namespace
ck
{
struct
Dynamic
PassThrough
class
Dynamic
Transformation
{
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
index_t
low_length_
;
__host__
__device__
constexpr
DynamicPassThrough
(
index_t
low_length
)
:
low_length_
(
low_length
)
{
}
__host__
__device__
static
constexpr
auto
GetNumOfLowerDimension
()
{
return
Number
<
1
>
{};
}
__host__
__device__
static
constexpr
auto
GetNumOfUpperDimension
()
{
return
Number
<
1
>
{};
}
__host__
__device__
static
constexpr
auto
GetUpperLengths
()
{
return
Sequence
<
Length
>
{};
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
idx_up
;
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
idx_up_diff
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
};
};
template
<
index_t
N
DimLow
>
template
<
index_t
N
>
struct
DynamicMerge
class
DynamicEmbed
:
public
DynamicTransformation
{
{
static
constexpr
index_t
ndim_low_
=
NDimLow
static
constexpr
index_t
ndim_up_
=
1
;
const
array
<
idnex_t
,
N
+
1
>
coefficients_
;
using
LowerIndex
=
MultiIndex
<
ndim_low_
>
;
using
UpperIndex
=
MultiIndex
<
ndum_up_
>
;
Array
<
index_t
,
NDimLow
>
low_lengths_
;
index_t
up_length_
;
__host__
__device__
static
constexpr
auto
GetNumOfLowerDimension
()
{
return
Number
<
ndim_low_
>
{};
}
__host__
__device__
static
constexpr
auto
GetNumOfUpperDimension
()
{
return
Number
<
ndim_up_
>
{};
}
__host__
__device__
static
constexpr
auto
GetUpperLengths
()
{
return
Array
<
index_t
,
1
>
up_length_
;
}
// emulate constexpr lambda
template
<
typename
PseudoLowStrides
>
struct
lambda_CalculateLowerIndex
{
index_t
&
itmp
;
LowerIndex
&
idx_low
;
__host__
__device__
explicit
constexpr
lambda_CalculateLowerIndex
(
index_t
&
itmp_
,
LowerIndex
&
idx_low_
)
:
itmp
(
itmp_
),
idx_low
(
idx_low_
)
{
}
template
<
typename
IDim
>
__host__
__device__
constexpr
void
operator
()(
IDim
idim
)
const
{
constexpr
index_t
stride
=
PseudoLowStrides
::
At
(
idim
);
idx_low
(
idim
)
=
itmp
/
stride
;
itmp
-=
idx_low
[
idim
]
*
stride
;
}
};
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
LowerIndex
idx_low
;
index_t
itmp
=
idx_up
[
0
];
constexpr
auto
pseudo_low_strides
=
reverse_inclusive_scan_sequence
(
LowerLengths
::
PopFront
(),
math
::
multiplies
<
index_t
>
{},
Number
<
1
>
{})
.
PushBack
(
Number
<
1
>
{});
static_for
<
0
,
nDimLow
-
1
,
1
>
{}(
lambda_CalculateLowerIndex
<
decltype
(
pseudo_low_strides
)
>
(
itmp
,
idx_low
));
idx_low
(
nDimLow
-
1
)
=
itmp
/
pseudo_low_strides
[
nDimLow
-
1
];
return
idx_low
;
}
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
idx_low_old
)
{
if
(
idx_up_diff
[
0
]
==
0
)
{
return
make_zero_array
<
index_t
,
nDimLow
>
();
}
else
{
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex
idx_low_diff_tmp
=
CalculateLowerIndex
(
idx_up_diff
);
// find out the last low dimension that changed
index_t
last_changed_low_dim
=
0
;
static_for
<
0
,
nDimLow
,
1
>
{}([
&
](
auto
i
)
{
if
(
idx_low_diff_tmp
[
i
]
!=
0
)
{
last_changed_low_dim
=
i
;
}
});
LowerIndex
idx_low_new
=
idx_low_old
+
idx_low_diff_tmp
;
if
(
idx_up_diff
[
0
]
>
0
)
{
// do carry check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool
carry
=
false
;
static_for
<
nDimLow
-
1
,
0
,
-
1
>
{}([
&
](
auto
i
)
{
if
(
i
<=
last_changed_low_dim
)
{
if
(
carry
)
{
++
idx_low_new
(
i
);
}
carry
=
false
;
if
(
idx_low_new
[
i
]
>=
LowerLengths
::
At
(
i
))
{
idx_low_new
(
i
)
-=
LowerLengths
::
At
(
i
);
carry
=
true
;
}
}
});
// highest dimension, no out-of-bound check
if
(
carry
)
{
++
idx_low_new
(
0
);
}
}
else
{
// do borrow check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool
borrow
=
false
;
static_for
<
nDimLow
-
1
,
0
,
-
1
>
{}([
&
](
auto
i
)
{
if
(
i
<=
last_changed_low_dim
)
{
if
(
borrow
)
{
--
idx_low_new
(
i
);
}
borrow
=
false
;
if
(
idx_low_new
[
i
]
<
0
)
{
idx_low_new
(
i
)
+=
LowerLengths
::
At
(
i
);
borrow
=
true
;
}
}
});
// highest dimension, no out-of-bound check
if
(
borrow
)
{
--
idx_low_new
(
0
);
}
}
return
idx_low_new
-
idx_low_old
;
}
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
false
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
__host__
__device__
constexpr
DynamicEmbed
(
coefficients
)
:
coefficients_
(
coefficients
)
{
{
return
true
;
}
}
};
};
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
View file @
b808713a
...
@@ -15,21 +15,6 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
...
@@ -15,21 +15,6 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return
DynamicNativeTensorDescriptor
<
Lengths
::
GetSize
()
>
(
lengths
,
strides
);
return
DynamicNativeTensorDescriptor
<
Lengths
::
GetSize
()
>
(
lengths
,
strides
);
}
}
template
<
typename
LowTensorDescriptor
,
typename
Transforms
,
typename
LowDimensions
,
typename
UpDimensions
>
__host__
__device__
constexpr
auto
transform_dynamic_tensor_descriptor
(
const
LowTensorDescriptor
&
low_tensor_desc
,
const
Transforms
&
transforms
,
LowDimensions
,
UpDimensions
)
{
return
DynamicTransformedTensorDescriptor
<
LowTensorDescriptor
,
Transforms
,
LowDimensions
,
UpDimensions
>
(
low_tensor_desc
,
transforms
);
}
}
// namespace ck
}
// namespace ck
#endif
#endif
driver/include/device_dummy_dynamic_transform.hpp
View file @
b808713a
...
@@ -75,11 +75,7 @@ void device_dummy_dynamic_transform(InDesc,
...
@@ -75,11 +75,7 @@ void device_dummy_dynamic_transform(InDesc,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>
,
const
Array
<
index_t
,
2
>>
,
index_t
,
index_t
,
index_t
,
index_t
>
,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
...
@@ -93,11 +89,7 @@ void device_dummy_dynamic_transform(InDesc,
...
@@ -93,11 +89,7 @@ void device_dummy_dynamic_transform(InDesc,
conv_strides
,
conv_strides
,
conv_dilations
,
conv_dilations
,
in_left_pads
,
in_left_pads
,
in_right_pads
,
in_right_pads
);
10
,
10
,
10
,
10
);
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment