Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4388f572
Commit
4388f572
authored
Aug 16, 2020
by
Chao Liu
Browse files
prototyping dynamic tensor descriptor
parent
b2098e70
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
857 additions
and
523 deletions
+857
-523
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
...rnel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+104
-20
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+440
-6
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
.../include/tensor_description/dynamic_tensor_descriptor.hpp
+2
-225
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
...e/tensor_description/dynamic_tensor_descriptor_helper.hpp
+0
-1
composable_kernel/include/utility/array.hpp
composable_kernel/include/utility/array.hpp
+1
-258
composable_kernel/include/utility/array_helper.hpp
composable_kernel/include/utility/array_helper.hpp
+297
-0
driver/src/conv_bwd_data_driver.cpp
driver/src/conv_bwd_data_driver.cpp
+13
-13
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
View file @
4388f572
...
@@ -12,16 +12,16 @@ namespace ck {
...
@@ -12,16 +12,16 @@ namespace ck {
template
<
index_t
BlockSize
>
template
<
index_t
BlockSize
>
struct
DummyDynamicTransform
struct
DummyDynamicTransform
{
{
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run
_v1
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
{
#if 1
#if 1
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
...
@@ -48,8 +48,8 @@ struct DummyDynamicTransform
...
@@ -48,8 +48,8 @@ struct DummyDynamicTransform
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadH
=
in_right_pads
[
0
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
const
index_t
InRightPadW
=
in_right_pads
[
1
];
#else
#else
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
const
index_t
Y
=
3
;
const
index_t
Y
=
3
;
const
index_t
X
=
3
;
const
index_t
X
=
3
;
...
@@ -234,7 +234,7 @@ struct DummyDynamicTransform
...
@@ -234,7 +234,7 @@ struct DummyDynamicTransform
idx_low_diff
-=
negative_carry
;
idx_low_diff
-=
negative_carry
;
negative_carry
=
do_borrow
?
1
:
negative_carry
;
negative_carry
=
do_borrow
?
1
:
negative_carry
;
#endif
#endif
};
};
...
@@ -267,9 +267,9 @@ struct DummyDynamicTransform
...
@@ -267,9 +267,9 @@ struct DummyDynamicTransform
const_tmp
[
i
]
=
p_wei_global
[
i
+
1
];
const_tmp
[
i
]
=
p_wei_global
[
i
+
1
];
}
}
#else
#else
const_tmp
[
0
]
=
0
;
const_tmp
[
0
]
=
0
;
const_tmp
[
1
]
=
2
;
const_tmp
[
1
]
=
2
;
const_tmp
[
2
]
=
2
;
const_tmp
[
2
]
=
2
;
#endif
#endif
// initialize idx
// initialize idx
...
@@ -302,8 +302,9 @@ struct DummyDynamicTransform
...
@@ -302,8 +302,9 @@ struct DummyDynamicTransform
// PassThrough GemmN => GemmN
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough
(
idx_diff
[
16
],
idx_diff
[
19
]);
f_lower_idx_diff_passthrough
(
idx_diff
[
16
],
idx_diff
[
19
]);
// stage 3
// stage 3
// Merge(C, Y, X) => GemmKTotal
// Merge(C, Y, X) => GemmKTotal
#if 0
f_lower_idx_diff_merge_v2(idx_diff[10],
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[11],
idx_diff[13],
idx_diff[13],
...
@@ -317,6 +318,47 @@ struct DummyDynamicTransform
...
@@ -317,6 +318,47 @@ struct DummyDynamicTransform
C,
C,
Y,
Y,
X);
X);
#elif
0
index_t
tmp
=
idx_diff
[
15
];
const
index_t
const_tmp_0
=
tmp
/
(
Y
*
X
);
tmp
-=
const_tmp_0
*
(
Y
*
X
);
const
index_t
const_tmp_1
=
tmp
/
X
;
const
index_t
const_tmp_2
=
tmp
-
const_tmp_1
*
X
;
f_lower_idx_diff_merge_v2
(
idx_diff
[
10
],
idx_diff
[
11
],
idx_diff
[
13
],
idx_diff
[
15
],
idx
[
10
],
idx
[
11
],
idx
[
13
],
const_tmp_0
,
const_tmp_1
,
const_tmp_2
,
C
,
Y
,
X
);
#elif 1
index_t
tmp
=
idx_diff
[
15
];
const
index_t
const_tmp_0
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
/
(
Y
*
X
));
tmp
-=
const_tmp_0
*
(
Y
*
X
);
const
index_t
const_tmp_1
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
/
X
);
const
index_t
const_tmp_2
=
__llvm_amdgcn_readfirstlane_i32
(
tmp
-
const_tmp_1
*
X
);
f_lower_idx_diff_merge_v2
(
idx_diff
[
10
],
idx_diff
[
11
],
idx_diff
[
13
],
idx_diff
[
15
],
idx
[
10
],
idx
[
11
],
idx
[
13
],
const_tmp_0
,
const_tmp_1
,
const_tmp_2
,
C
,
Y
,
X
);
#endif
// stage 2
// stage 2
// PassThrough(N) => N
// PassThrough(N) => N
...
@@ -372,7 +414,7 @@ struct DummyDynamicTransform
...
@@ -372,7 +414,7 @@ struct DummyDynamicTransform
idx
[
13
]
+=
idx_diff
[
13
];
idx
[
13
]
+=
idx_diff
[
13
];
// padding check
// padding check
bool
is_in_bound
=
true
;
bool
is_in_bound
=
true
;
#else // pad
#else // pad
// offset
// offset
idx
[
0
]
+=
idx_diff
[
0
];
idx
[
0
]
+=
idx_diff
[
0
];
...
@@ -409,7 +451,39 @@ struct DummyDynamicTransform
...
@@ -409,7 +451,39 @@ struct DummyDynamicTransform
}
}
}
}
__device__
void
Run_
(
index_t
*
const
__restrict__
p_wei_global
,
__device__
void
Run_v2
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
in_n_c_hi_wi_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
out_n_k_ho_wo_global_desc
,
const
Array
<
index_t
,
2
>
conv_strides
,
const
Array
<
index_t
,
2
>
conv_dilations
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
Index
idx_up
;
idx_up
(
0
)
=
in_n_c_hi_wi_global_desc
.
GetLength
(
0
);
idx_up
(
1
)
=
in_n_c_hi_wi_global_desc
.
GetLength
(
1
);
idx_up
(
2
)
=
in_n_c_hi_wi_global_desc
.
GetLength
(
2
);
idx_up
(
3
)
=
in_n_c_hi_wi_global_desc
.
GetLength
(
3
);
#if 0
constexpr auto trans = GetTransforms();
auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
#elif
1
constexpr
DynamicCoordinateTransform
*
tran
=
&
embed
;
auto
idx_low
=
tran
->
CalculateLowerIndex
(
idx_up
);
#endif
p_out_global
[
get_thread_local_1d_id
()]
=
idx_low
[
0
];
}
__device__
void
Run
(
index_t
*
const
__restrict__
p_wei_global
,
index_t
*
const
__restrict__
p_in_global
,
index_t
*
const
__restrict__
p_in_global
,
float
*
const
__restrict__
p_out_global
,
float
*
const
__restrict__
p_out_global
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
const
DynamicNativeTensorDescriptor
<
4
>
wei_k_c_y_x_global_desc
,
...
@@ -420,6 +494,16 @@ struct DummyDynamicTransform
...
@@ -420,6 +494,16 @@ struct DummyDynamicTransform
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_left_pads
,
const
Array
<
index_t
,
2
>
in_right_pads
)
const
const
Array
<
index_t
,
2
>
in_right_pads
)
const
{
{
Run_v2
(
p_wei_global
,
p_in_global
,
p_out_global
,
wei_k_c_y_x_global_desc
,
in_n_c_hi_wi_global_desc
,
out_n_k_ho_wo_global_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
);
}
}
};
};
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
4388f572
...
@@ -5,18 +5,452 @@
...
@@ -5,18 +5,452 @@
namespace
ck
{
namespace
ck
{
class
Dynamic
Transformation
struct
Dynamic
PassThrough
{
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
index_t
low_length_
;
__host__
__device__
explicit
constexpr
DynamicPassThrough
(
const
index_t
&
low_length
)
:
low_length_
(
low_length
)
{
}
__host__
__device__
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperIndex
()
{
return
UpperIndex
({
low_length_
});
}
__host__
__device__
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
idx_up
;
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
idx_up_diff
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
/* idx_up */
)
{
return
true
;
}
};
};
template
<
index_t
N
>
template
<
bool
SkipIsValidCheck
=
false
>
class
DynamicEmbed
:
public
DynamicTransformation
struct
DynamicLeftPad
{
{
const
array
<
idnex_t
,
N
+
1
>
coefficients_
;
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
index_t
low_length_
;
const
index_t
left_pad_
;
__host__
__device__
explicit
constexpr
Pad
(
const
index_t
&
low_length
,
const
index_t
&
left_pad
)
:
low_length_
{
low_length
},
left_pad_
{
left_pad
}
{
}
__host__
__device__
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperIndex
()
{
return
UpperIndex
({
low_length_
+
left_pad_
});
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
LowerIndex
{
idx_up
-
lef_pad_
};
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
idx_up_diff
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
SkipIsValidCheck
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
SkipIsValidCheck
||
(
idx_up
[
0
]
>=
left_pad_
);
}
};
template
<
bool
SkipIsValidCheck
=
false
>
struct
DynamicRightPad
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
index_t
low_length_
;
const
index_t
right_pad_
;
__host__
__device__
explicit
constexpr
Pad
(
const
index_t
&
low_length
,
const
index_t
&
right_pad
)
:
low_length_
{
low_length
},
right_pad_
{
right_pad
}
{
}
__host__
__device__
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperIndex
()
{
return
UpperIndex
({
low_length_
+
right_pad_
});
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
idx_up
;
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
idx_up_diff
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
SkipIsValidCheck
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
idx_up
)
{
return
SkipIsValidCheck
||
(
idx_up
[
0
]
<
low_length_
);
}
};
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
template
<
index_t
NDimUp
>
struct
DynamicEmbed
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
const
index_t
low_length_
;
const
UpperIndex
up_lengths_
;
const
Array
<
index_t
,
NDimUp
+
1
>
coefficients_
;
__host__
__device__
explicit
constexpr
Embed
(
const
index_t
&
low_length
,
const
UpperIndex
&
up_lengths
,
const
Array
<
index_t
,
NDimUp
+
1
>&
coefficients
)
:
low_length_
(
low_length
),
up_lengths_
(
up_lengths
),
coefficients_
(
coefficients
)
{
static_assert
(
up_lengths
.
GetSize
()
==
nDimUp
&&
coefficients
.
GetSize
()
==
nDimUp
+
1
,
"wrong! # of dimensions not consistent"
);
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
NDimUp
;
}
__host__
__device__
static
constexpr
auto
GetUpperIndex
()
{
return
up_lengths_
;
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
index_t
idx_low
=
coefficients_
[
NDimUp
];
for
(
index_t
i
=
0
;
i
<
nDimUp
;
++
i
)
{
idx_low
+=
idx_up
[
i
]
*
coefficients_
[
i
];
}
return
LowerIndex
({
idx_low
});
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
index_t
idx_low_diff
=
0
;
for
(
index_t
i
=
0
;
i
<
nDimUp
;
++
i
)
{
idx_low_diff
+=
idx_up_diff
[
i
]
*
Coefficients
{}[
i
];
}
return
LowerIndex
({
idx_low_diff
});
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
/* idx_up */
)
{
return
true
;
}
};
template
<
index_t
NDimLow
>
struct
DynamicMerge
{
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
LowerIndex
low_lengths_
;
const
LowerIndex
low_lengths_scan_
;
const
index_t
up_length_
;
__host__
__device__
explicit
constexpr
DynamicMerge
(
const
LowerIndex
&
low_lengths_
)
:
low_lengths_
(
low_lengths
),
low_lengths_scan_
(
reverse_inclusive_scan_on_array
(
low_lengths
,
multiplies
<
index_t
>
()),
up_length
(
accumulate_on_array
(
low_lengths
,
multiplies
<
index_t
>
(),
1
))
{
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
NDimLow
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperIndex
()
const
{
return
UpperIndex
({
up_length_
});
}
__host__
__device__
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
const
{
LowerIndex
idx_low
;
index_t
itmp
=
idx_up
[
0
];
#pragma unroll
for
(
index_t
i
;
i
<
NDimLow
-
1
;
++
i
)
{
idx_low
(
i
)
=
itmp
/
low_lengths_scan_
[
i
];
itmp
-=
idx_low
[
i
]
*
low_lengths_scan_
[
i
];
}
idx_low
(
NDimLow
-
1
)
=
itmp
;
#pragma unroll
return
idx_low
;
}
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
idx_low_old
)
{
LowerIndex
idx_low_diff
;
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler generated ISA should
// only caclculate idx_low_diff once and save it durinng the whole kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex
idx_low_diff_const
=
CalculateLowerIndex
(
idx_up_diff
);
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t
carry
=
0
;
#pragma unroll
for
(
index_t
i
=
NDimLow
-
1
;
i
>
1
;
--
i
)
{
// this should be saved as well
index_t
idx_low_length_minus_idx_low_diff_const
=
low_lengths_
[
i
]
-
idx_low_diff_const
[
i
];
#if 0
index_t idx_low_length_plus_idx_low_diff_const =
low_lengths_[i] + idx_low_diff_const[i];
#endif
index_t
idx_low_tmp
[
i
]
=
idx_low_old
[
i
]
+
carry
;
bool
do_carry
=
idx_low_tmp
>=
idx_low_length_minus_idx_low_diff_const
;
#if 0
bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
#endif
idx_low_diff
[
i
]
=
do_carry
?
-
idx_low_length_minus_idx_low_diff_const
:
idx_low_diff_const
;
#if 0
idx_low_diff[i] =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
#endif
idx_low_diff
[
i
]
+=
carry
;
carry
=
do_carry
?
1
:
0
;
#if 0
carry = do_borrow ? -1 : carry;
#endif
}
idx_low_diff
[
0
]
=
idx_low_diff_const
[
0
]
+
carry
;
return
idx_low_diff
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
false
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
/* idx_up */
)
{
return
true
;
}
};
template
<
index_t
NDimUp
>
struct
DynamicUnMerge
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
const
UpperIndex
up_lengths_
;
const
UpperIndex
up_lengths_scan_
;
__host__
__device__
explicit
constexpr
DynamicUnMerge
(
const
UpperIndex
&
up_lengths
)
:
up_lengths_
(
up_lengths
),
up_lengths_scan_
(
reverse_exclusive_scan_on_array
(
up_lengths
,
multiplies
<
index_t
>
(),
index_t
(
1
))
{
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
NDimUp
;
}
__host__
__device__
constexpr
auto
GetUpperIndex
()
const
{
return
up_lengths_
;
}
__host__
__device__
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
const
{
index_t
idx_low
=
idx_up
[
NDimUp
];
#pragma unroll
for
(
index_t
i
=
0
;
i
<
NDimUp
-
1
;
++
i
)
{
idx_low
+=
idx_up
[
i
]
*
up_lengths_scan_
[
i
];
}
return
LowerIndex
{
idx_low
};
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
idx_up_diff
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
CalculateLowerIndex
(
idx_up_diff
);
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
/* idx_up */
)
{
return
true
;
}
};
struct
DynamicFreeze
{
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<>
;
const
index_t
low_idx_
;
const
index_t
low_length_
;
__host__
__device__
explicit
constexpr
Freeze
(
const
index_t
&
low_idx
,
const
index_t
&
low_length
)
:
low_idx_
(
low_idx
),
low_length_
(
low_length
)
{
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
0
;
}
__host__
__device__
static
constexpr
auto
GetUpperIndex
()
{
return
UpperIndex
();
}
__host__
__device__
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
/*idx_up*/
)
const
{
return
LowerIndex
({
low_length_
});
}
__host__
__device__
static
constexpr
auto
CalculateLowerIndexDiff
(
const
UpperIndex
&
/* idx_up_diff */
,
const
UpperIndex
&
/* idx_up_old */
,
const
LowerIndex
&
/* idx_low_old */
)
{
return
LowerIndex
({
0
});
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
constexpr
DynamicEmbed
(
coefficients
)
__host__
__device__
static
constexpr
bool
:
coefficients_
(
coefficients
)
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpperIndex
&
/* idx_up */
)
{
{
return
true
;
}
}
};
};
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
View file @
4388f572
...
@@ -2,236 +2,13 @@
...
@@ -2,236 +2,13 @@
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"
namespace
ck
{
namespace
ck
{
template
<
index_t
NDim
>
struct
TensorDescriptor
struct
DynamicNativeTensorDescriptor
{
{
using
Index
=
MultiIndex
<
NDim
>
;
Array
<
index_t
,
NDim
>
lengths_
;
Array
<
index_t
,
NDim
>
strides_
;
index_t
element_size_
;
index_t
element_space_
;
template
<
typename
Lengths
,
typename
Strides
>
__host__
__device__
constexpr
DynamicNativeTensorDescriptor
(
const
Lengths
&
lengths
,
const
Strides
&
strides
)
:
lengths_
(
lengths
),
strides_
(
strides
)
{
element_size_
=
1
;
for
(
index_t
i
=
0
;
i
<
NDim
;
++
i
)
{
element_size_
*=
lengths_
[
i
];
}
element_space_
=
1
;
for
(
index_t
i
=
0
;
i
<
NDim
;
++
i
)
{
element_space_
+=
(
lengths_
[
i
]
-
1
)
*
strides_
[
i
];
}
}
__host__
__device__
static
constexpr
auto
GetNumOfDimension
()
{
return
NDim
;
}
__host__
__device__
constexpr
auto
GetLength
(
const
index_t
&
i
)
const
{
return
lengths_
[
i
];
}
__host__
__device__
constexpr
auto
GetStride
(
const
index_t
&
i
)
const
{
return
strides_
[
i
];
}
__host__
__device__
constexpr
auto
GetLengths
()
const
{
return
lengths_
;
}
__host__
__device__
constexpr
auto
GetStrides
()
const
{
return
strides_
;
}
__host__
__device__
constexpr
auto
GetElementSize
()
const
{
return
element_size_
;
}
__host__
__device__
constexpr
auto
GetElementSpace
()
const
{
return
element_space_
;
}
__host__
__device__
constexpr
auto
CalculateOffset
(
const
Index
&
idx
)
const
{
index_t
offset
=
0
;
#pragma unroll
for
(
index_t
i
=
0
;
i
<
NDim
;
++
i
)
{
offset
+=
idx
[
i
]
*
strides_
[
i
];
}
return
offset
;
}
__host__
__device__
constexpr
auto
CalculateOffsetDiff
(
const
Index
&
idx_diff
)
const
{
index_t
offset_diff
=
0
;
#pragma unroll
for
(
index_t
i
=
0
;
i
<
NDim
;
++
i
)
{
offset_diff
+=
idx_diff
[
i
]
*
strides_
[
i
];
}
return
offset_diff
;
}
__host__
__device__
constexpr
bool
IsUpperIndexValid
(
const
Index
&
idx
)
const
{
bool
flag
=
true
;
#pragma unroll
for
(
index_t
i
=
0
;
i
<
NDim
;
++
i
)
{
flag
=
flag
&&
idx
[
i
]
>=
0
&&
idx
[
i
]
<
lengths_
[
i
];
}
return
flag
;
}
};
};
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
typename Transforms, // Tuple<DynamicMultIndexTransforms,...>
typename LowDimensions, // Tuple<Sequence<...>,...>
typename UpDimensions> // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
using Type = DynamicTransformedTensorDescriptor;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimension are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
static constexpr index_t num_transform_ = Transforms::Size();
using UpperIndex = MultiIndex<ndim_up_>;
using LowerIndex = MultiIndex<ndim_low_>;
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
const LowDimensions low_dims_;
const UpDimensions up_dims_;
__host__ __device__ constexpr TransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_(low_tensor_desc),
transforms_(transforms)
{
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const
{
return low_dims_;
}
__host__ __device__ constexpr auto GetUpperLengths() cons
{
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr auto GetLength(index_t i) const
{
return GetLengths()[i];
}
__host__ __device__ constexpr auto GetElementSize() const
{
index_t element_size = 1;
for(index_t i = 0; i < ndim_up_; ++i)
{
element_size *= GetLength(i);
}
return element_size;
}
__host__ __device__ constexpr auto GetElementSpace() const
{
return lower_tensor_desc_.GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
static_for<0, num_transform_, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
// this assume each lower (single) index is only assocaited with one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
// TODO: right now return value is not constexpr because use of non-constepxr lambda
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
{
LowerIndex idx_low_diff;
static_for<0, nTransform, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensions{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
// this assume each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ static constexpr index_t CalculateOffset(const UpperIndex& idx_up)
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
};
#endif
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
View file @
4388f572
...
@@ -15,6 +15,5 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
...
@@ -15,6 +15,5 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return
DynamicNativeTensorDescriptor
<
Lengths
::
GetSize
()
>
(
lengths
,
strides
);
return
DynamicNativeTensorDescriptor
<
Lengths
::
GetSize
()
>
(
lengths
,
strides
);
}
}
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/utility/array.hpp
View file @
4388f572
...
@@ -13,7 +13,7 @@ struct Array
...
@@ -13,7 +13,7 @@ struct Array
using
data_type
=
TData
;
using
data_type
=
TData
;
// TODO: implement empty Array
// TODO: implement empty Array
index_t
mData
[
NSize
];
TData
mData
[
NSize
]
=
{
0
}
;
__host__
__device__
explicit
constexpr
Array
()
{}
__host__
__device__
explicit
constexpr
Array
()
{}
...
@@ -159,262 +159,5 @@ struct ArrayElementPicker
...
@@ -159,262 +159,5 @@ struct ArrayElementPicker
Arr
&
mArray
;
Arr
&
mArray
;
};
};
template
<
typename
Arr
,
typename
Picks
>
__host__
__device__
constexpr
auto
pick_array_element
(
Arr
&
a
,
Picks
)
{
return
ArrayElementPicker
<
Arr
,
Picks
>
(
a
);
}
template
<
typename
T
>
__host__
__device__
constexpr
auto
to_array
(
const
T
&
x
)
{
Array
<
typename
T
::
data_type
,
T
::
Size
()
>
y
;
static_for
<
0
,
T
::
Size
(),
1
>
{}([
&
](
auto
i
)
{
y
.
At
(
i
)
=
x
.
At
(
i
);
});
return
y
;
}
// TODO: remove this
template
<
index_t
...
Is
>
__host__
__device__
constexpr
auto
sequence2array
(
Sequence
<
Is
...
>
)
{
return
Array
<
index_t
,
sizeof
...(
Is
)
>
{
Is
...};
}
template
<
typename
TData
,
index_t
NSize
>
__host__
__device__
constexpr
auto
make_zero_array
()
{
constexpr
auto
zero_sequence
=
typename
uniform_sequence_gen
<
NSize
,
0
>::
type
{};
constexpr
auto
zero_array
=
sequence2array
(
zero_sequence
);
return
zero_array
;
}
template
<
typename
TData
,
index_t
NSize
,
index_t
...
IRs
>
__host__
__device__
constexpr
auto
reorder_array_given_new2old
(
const
Array
<
TData
,
NSize
>&
old_array
,
Sequence
<
IRs
...
>
/*new2old*/
)
{
static_assert
(
NSize
==
sizeof
...(
IRs
),
"NSize not consistent"
);
static_assert
(
is_valid_sequence_map
<
Sequence
<
IRs
...
>>
{},
"wrong! invalid reorder map"
);
return
Array
<
TData
,
NSize
>
{
old_array
[
IRs
]...};
}
template
<
typename
TData
,
index_t
NSize
,
typename
MapOld2New
>
struct
lambda_reorder_array_given_old2new
{
const
Array
<
TData
,
NSize
>&
old_array
;
Array
<
TData
,
NSize
>&
new_array
;
__host__
__device__
constexpr
lambda_reorder_array_given_old2new
(
const
Array
<
TData
,
NSize
>&
old_array_
,
Array
<
TData
,
NSize
>&
new_array_
)
:
old_array
(
old_array_
),
new_array
(
new_array_
)
{
}
template
<
index_t
IOldDim
>
__host__
__device__
constexpr
void
operator
()(
Number
<
IOldDim
>
)
const
{
TData
old_data
=
old_array
[
IOldDim
];
constexpr
index_t
INewDim
=
MapOld2New
::
At
(
Number
<
IOldDim
>
{});
new_array
(
Number
<
INewDim
>
{})
=
old_data
;
}
};
template
<
typename
TData
,
index_t
NSize
,
index_t
...
IRs
>
__host__
__device__
constexpr
auto
reorder_array_given_old2new
(
const
Array
<
TData
,
NSize
>&
old_array
,
Sequence
<
IRs
...
>
/*old2new*/
)
{
Array
<
TData
,
NSize
>
new_array
;
static_assert
(
NSize
==
sizeof
...(
IRs
),
"NSize not consistent"
);
static_assert
(
is_valid_sequence_map
<
Sequence
<
IRs
...
>>::
value
,
"wrong! invalid reorder map"
);
static_for
<
0
,
NSize
,
1
>
{}(
lambda_reorder_array_given_old2new
<
TData
,
NSize
,
Sequence
<
IRs
...
>>
(
old_array
,
new_array
));
return
new_array
;
}
template
<
typename
TData
,
index_t
NSize
,
typename
ExtractSeq
>
__host__
__device__
constexpr
auto
extract_array
(
const
Array
<
TData
,
NSize
>&
old_array
,
ExtractSeq
)
{
Array
<
TData
,
ExtractSeq
::
GetSize
()
>
new_array
;
constexpr
index_t
new_size
=
ExtractSeq
::
GetSize
();
static_assert
(
new_size
<=
NSize
,
"wrong! too many extract"
);
static_for
<
0
,
new_size
,
1
>
{}([
&
](
auto
I
)
{
new_array
(
I
)
=
old_array
[
ExtractSeq
::
At
(
I
)];
});
return
new_array
;
}
// emulate constexpr lambda for array
// Functor applying z[i] = f(x[i], y[i]) one compile-time index at a time;
// passed to static_for by the element-wise Array operators below. Members are
// references, so the caller's operands/output are used directly.
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
    const F& f; // binary op
    const X& x; // left operand
    const Y& y; // right operand
    Z& z;       // output, written in place

    __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
        : f(f_), x(x_), y(y_), z(z_)
    {
    }

    // Invoked once per index by static_for.
    template <index_t IDim_>
    __host__ __device__ constexpr void operator()(Number<IDim_>) const
    {
        constexpr auto IDim = Number<IDim_>{};

        z(IDim) = f(x[IDim], y[IDim]);
    }
};
// Array = Array + Array (element-wise)
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
    Array<TData, NSize> result;

    // Use the element type for the binary op. The previous math::plus<index_t>
    // silently converted non-integral TData (e.g. float) through index_t.
    auto f = math::plus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array - Array (element-wise)
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array += Array: element-wise in-place accumulate, delegating to operator+.
// Returns a copy of the updated array (auto deduces by value).
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
    return a = a + b;
}
// Array -= Array: element-wise in-place subtract, delegating to operator-.
// Returns a copy of the updated array (auto deduces by value).
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
    return a = a - b;
}
// Array = Array + Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::plus<TData> instead of hard-coded math::plus<index_t>, which
    // truncated non-integral element types.
    auto f = math::plus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array - Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array * Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::multiplies<TData> instead of hard-coded math::multiplies<index_t>,
    // which truncated non-integral element types.
    auto f = math::multiplies<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Sequence - Array (element-wise; Sequence on the left)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = TData * Array: scale every element by scalar v.
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
    // 'a' is taken by value, so it can be scaled in place and returned.
    for(index_t idx = 0; idx < NSize; ++idx)
    {
        a(idx) = a[idx] * v;
    }

    return a;
}
// Left fold over the array:
//   result = f(...f(f(init, a[0]), a[1])..., a[NSize-1])
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
    TData result = init;

    static_assert(NSize > 0, "wrong");

    static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });

    return result;
}
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/utility/array_helper.hpp
0 → 100644
View file @
4388f572
#ifndef CK_ARRAY_HELPER_HPP
#define CK_ARRAY_HELPER_HPP
#include "array.hpp"
namespace
ck
{
// Wrap array 'a' in an ArrayElementPicker restricted to the indices in Picks.
// NOTE(review): ArrayElementPicker is declared elsewhere (array.hpp);
// presumably a non-owning proxy view over the picked elements -- confirm at
// its definition before relying on lifetime/ownership.
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{
    return ArrayElementPicker<Arr, Picks>(a);
}
// Copy any statically sized, statically indexable container (e.g. a Sequence
// exposing data_type, Size() and At()) into an Array of the same length.
template <typename T>
__host__ __device__ constexpr auto to_array(const T& x)
{
    Array<typename T::data_type, T::Size()> y;

    static_for<0, T::Size(), 1>{}([&](auto i) { y.At(i) = x.At(i); });

    return y;
}
// Return an Array<TData, NSize> with every element zero.
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
    // Fill directly instead of going through uniform_sequence_gen/to_array:
    // the old path returned an array of the Sequence's data_type, silently
    // ignoring the TData template parameter.
    Array<TData, NSize> zero_array;

    static_for<0, NSize, 1>{}([&](auto i) { zero_array.At(i) = TData{0}; });

    return zero_array;
}
// Permute an array: new_array[i] = old_array[new2old[i]].
// The Sequence argument is a compile-time new->old position map and is used
// for its type only (hence the unnamed /*new2old*/ parameter).
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
                                                               Sequence<IRs...> /*new2old*/)
{
    static_assert(NSize == sizeof...(IRs), "NSize not consistent");

    // use ::value for consistency with reorder_array_given_old2new
    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

    // gather: pack-expand the new->old map directly into the constructor
    return Array<TData, NSize>{old_array[IRs]...};
}
// Functor for static_for that scatters one element per invocation:
//   new_array[MapOld2New[IOldDim]] = old_array[IOldDim]
// A named struct is used instead of a lambda so the call operator can be
// constexpr and templated on the compile-time index (pre-C++17 pattern; see
// the "emulate constexpr lambda" note on lambda_array_math).
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
    const Array<TData, NSize>& old_array; // source, read-only
    Array<TData, NSize>& new_array;       // destination, written in place

    __host__ __device__ constexpr lambda_reorder_array_given_old2new(
        const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
        : old_array(old_array_), new_array(new_array_)
    {
    }

    // Invoked once per old-dimension index by static_for.
    template <index_t IOldDim>
    __host__ __device__ constexpr void operator()(Number<IOldDim>) const
    {
        TData old_data = old_array[IOldDim];

        // map old position -> new position at compile time
        constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});

        new_array(Number<INewDim>{}) = old_data;
    }
};
// Permute an array: new_array[old2new[i]] = old_array[i].
// The Sequence argument is a compile-time old->new position map and is used
// for its type only (hence the unnamed /*old2new*/ parameter).
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
                                                               Sequence<IRs...> /*old2new*/)
{
    Array<TData, NSize> new_array;

    static_assert(NSize == sizeof...(IRs), "NSize not consistent");

    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

    // scatter each old element into its mapped new slot
    static_for<0, NSize, 1>{}(
        lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));

    return new_array;
}
// Gather a subset of elements: new_array[i] = old_array[ExtractSeq[i]].
// ExtractSeq is a compile-time index list; the result is sized to it.
template <typename TData, index_t NSize, typename ExtractSeq>
__host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
    Array<TData, ExtractSeq::GetSize()> new_array;

    constexpr index_t new_size = ExtractSeq::GetSize();

    static_assert(new_size <= NSize, "wrong! too many extract");

    static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::At(I)]; });

    return new_array;
}
// emulate constexpr lambda for array
// Functor applying z[i] = f(x[i], y[i]) one compile-time index at a time;
// passed to static_for by the element-wise Array operators below. Members are
// references, so the caller's operands/output are used directly.
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
    const F& f; // binary op
    const X& x; // left operand
    const Y& y; // right operand
    Z& z;       // output, written in place

    __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
        : f(f_), x(x_), y(y_), z(z_)
    {
    }

    // Invoked once per index by static_for.
    template <index_t IDim_>
    __host__ __device__ constexpr void operator()(Number<IDim_>) const
    {
        constexpr auto IDim = Number<IDim_>{};

        z(IDim) = f(x[IDim], y[IDim]);
    }
};
// Array = Array + Array (element-wise)
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
    Array<TData, NSize> result;

    // Use the element type for the binary op. The previous math::plus<index_t>
    // silently converted non-integral TData (e.g. float) through index_t.
    auto f = math::plus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array - Array (element-wise)
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array += Array: element-wise in-place accumulate, delegating to operator+.
// Returns a copy of the updated array (auto deduces by value).
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
    return a = a + b;
}
// Array -= Array: element-wise in-place subtract, delegating to operator-.
// Returns a copy of the updated array (auto deduces by value).
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
    return a = a - b;
}
// Array = Array + Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::plus<TData> instead of hard-coded math::plus<index_t>, which
    // truncated non-integral element types.
    auto f = math::plus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array - Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Array * Sequence (element-wise; the Sequence supplies compile-time values)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::multiplies<TData> instead of hard-coded math::multiplies<index_t>,
    // which truncated non-integral element types.
    auto f = math::multiplies<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = Sequence - Array (element-wise; Sequence on the left)
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");

    Array<TData, NSize> result;

    // math::minus<TData> instead of hard-coded math::minus<index_t>, which
    // truncated non-integral element types.
    auto f = math::minus<TData>{};

    static_for<0, NSize, 1>{}(
        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
            f, a, b, result));

    return result;
}
// Array = TData * Array: scale every element by scalar v.
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
    // 'a' is taken by value, so it can be scaled in place and returned.
    for(index_t idx = 0; idx < NSize; ++idx)
    {
        a(idx) = a[idx] * v;
    }

    return a;
}
// Left fold over the array:
//   result = f(...f(f(init, a[0]), a[1])..., a[NSize-1])
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
    TData result = init;

    static_assert(NSize > 0, "wrong");

    static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });

    return result;
}
// Reverse inclusive scan: walking i from NSize-1 down to 0,
//   r    = f(r, x[i])   (r seeded with init)
//   y[i] = r
// so y[i] accumulates x[NSize-1] ... x[i], inclusive of x[i].
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
    Array<TData, NSize> y;

    TData r = init;

#pragma unroll
    for(index_t i = NSize - 1; i >= 0; --i)
    {
        r    = f(r, x[i]);
        y(i) = r;
    }

    return y;
}
// Reverse exclusive scan: walking i from NSize-1 down to 1,
//   y[i] = r            (r seeded with init, so y[NSize-1] == init)
//   r    = f(r, x[i])
// and finally y[0] accumulates x[NSize-1] ... x[1] (excluding x[0]).
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
    Array<TData, NSize> y;

    TData r = init;

#pragma unroll
    for(index_t i = NSize - 1; i > 0; --i)
    {
        y(i) = r;

        r = f(r, x[i]);
    }

    // Fix: the original wrote "y(i) = r" here, but 'i' is scoped to the loop
    // above (ill-formed outside it); the element written after the loop is
    // index 0.
    y(0) = r;

    return y;
}
}
// namespace ck
#endif
driver/src/conv_bwd_data_driver.cpp
View file @
4388f572
...
@@ -197,7 +197,7 @@ int main(int argc, char* argv[])
...
@@ -197,7 +197,7 @@ int main(int argc, char* argv[])
constexpr
index_t
X
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
2
,
2
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
...
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
...
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
#endif
#endif
}
}
#if
1
#if
0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif
0
#elif
0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
...
@@ -253,17 +253,17 @@ int main(int argc, char* argv[])
...
@@ -253,17 +253,17 @@ int main(int argc, char* argv[])
#elif 1
#elif 1
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
#endif
#endif
(
in_nchw_desc
,
(
in_nchw_desc
,
in_nchw_device
,
in_nchw_device
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_desc
,
out_nkhw
,
out_nkhw
,
ConvStrides
{},
ConvStrides
{},
ConvDilations
{},
ConvDilations
{},
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
if
(
do_verification
)
if
(
do_verification
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment