gaoqiong / composable_kernel

Commit f1541994, authored Aug 18, 2020 by Chao Liu

change clang-format to 5.0

Parent: 863e069b
Showing 12 changed files with 189 additions and 191 deletions (+189 -191)
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp  +7 -7
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp  +1 -1
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp  +7 -8
composable_kernel/include/tensor_description/tensor_coordinate.hpp  +2 -2
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp  +6 -6
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp  +23 -26
composable_kernel/include/tensor_operation/blockwise_gemm.hpp  +3 -3
composable_kernel/include/tensor_operation/threadwise_gemm.hpp  +2 -3
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +87 -85
driver/include/device_dummy_dynamic_transform.hpp  +1 -1
driver/include/host_conv.hpp  +1 -1
external/half/include/half.hpp  +49 -48
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
@@ -316,8 +316,7 @@ struct DynamicTransformedTensorDescriptor
            constexpr bool is_valid_up_always_mapped_to_valid_low =
                decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();

            if constexpr(!is_valid_up_always_mapped_to_valid_low)
            {
                const auto up_dims_part = UpDimensionIds{}.At(itran);
                const auto idx_up_part  = pick_array_element(idx_up, up_dims_part);
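The hunk above only re-wraps an `if constexpr` guard, but the guard is the interesting part: when a transform reports that every upper index maps to a valid lower index, the bounds check is dropped at compile time. A minimal sketch of that idiom, using hypothetical names (map_index, AlwaysValid) that are not part of the repository:

    #include <cstdio>

    // AlwaysValid plays the role of is_valid_up_always_mapped_to_valid_low:
    // when it is true, `if constexpr` removes the range check entirely.
    template <bool AlwaysValid>
    int map_index(int idx_up, int lower_length)
    {
        int idx_low = idx_up; // hypothetical pass-through transform
        if constexpr(!AlwaysValid)
        {
            if(idx_low < 0 || idx_low >= lower_length)
                return -1; // out-of-range mapping detected at run time
        }
        return idx_low;
    }

    int main()
    {
        std::printf("%d %d\n", map_index<true>(7, 4), map_index<false>(7, 4));
        return 0;
    }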
composable_kernel/include/tensor_description/tensor_coordinate.hpp
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
@@ -64,7 +64,7 @@ template <typename LowerTensorDescriptor,
          index_t... LowerDimensionIds,
          index_t... UpperDimensionIds>
__host__ __device__ constexpr auto
reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
                                           Sequence<LowerLengths...>,
                                           Sequence<LowerDimensionIds...>,
                                           Sequence<UpperDimensionIds...>)

@@ -78,7 +78,7 @@ __host__ __device__ constexpr auto
// reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");

@@ -96,7 +96,7 @@ __host__ __device__ constexpr auto
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");
composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
@@ -210,15 +210,14 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
#pragma unroll
            for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
                threadwise_matrix_copy(
                    a_block_mtx,
                    p_a_block +
                        a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) +
                        ib * BlockMatrixStrideA + mMyThreadOffsetA,
                    a_thread_mtx,
                    p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
                    a_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadA>{});
            }

@@ -229,15 +228,14 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
#pragma unroll
            for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
                threadwise_matrix_copy(
                    b_block_mtx,
                    p_b_block +
                        b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) +
                        ib * BlockMatrixStrideB + mMyThreadOffsetB,
                    b_thread_mtx,
                    p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
                    b_thread_sub_mtx.GetLengths(),
                    Number<DataPerReadB>{});
            }

@@ -391,9 +389,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                {
                    threadwise_matrix_copy(
                        c_thread_sub_mtx,
                        p_c_thread + c_thread_sub_mtx.GetOffsetFromMultiIndex(
                                         m_repeat * MPerLevel1Cluster, n_repeat * NPerLevel1Cluster),
                        c_block_mtx,
                        p_c_block + c_block_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,

@@ -405,5 +402,5 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
    }
};

} // namespace ck
#endif
composable_kernel/include/tensor_operation/blockwise_gemm.hpp
@@ -336,9 +336,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;

        static_if<MRepeat == 2 && NRepeat == 2>{}([&](auto) {
            Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
        }).Else([&](auto) { Run_naive(p_a_block, p_b_block, p_c_thread); });
#else
        Run_naive(p_a_block, p_b_block, p_c_thread);
#endif
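The `static_if<...>{}(...).Else(...)` pattern that clang-format re-wraps here selects between the specialized 2x2 pipelined GEMM and the naive fallback at compile time, so only one branch is instantiated. A self-contained sketch of that dispatch idiom, written under the assumption that it behaves like the usage above; this is not the library's actual static_if implementation:

    #include <cstdio>

    // Minimal stand-in for the static_if utility used in the hunk: the "then"
    // functor runs only when Cond is true, the Else functor only when it is false.
    template <bool Cond>
    struct static_if
    {
        template <typename F>
        const static_if& operator()(F f) const
        {
            if constexpr(Cond)
                f(0);
            return *this;
        }

        template <typename F>
        void Else(F f) const
        {
            if constexpr(!Cond)
                f(0);
        }
    };

    void Run_pipelined_2x2() { std::puts("pipelined 2x2 path"); }
    void Run_naive() { std::puts("naive path"); }

    int main()
    {
        constexpr int MRepeat = 2, NRepeat = 2;
        static_if<MRepeat == 2 && NRepeat == 2>{}([&](auto) { Run_pipelined_2x2(); })
            .Else([&](auto) { Run_naive(); });
        return 0;
    }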
composable_kernel/include/tensor_operation/threadwise_gemm.hpp
@@ -153,9 +153,8 @@ struct ThreadwiseGemmTransANormalBNormalC
            (is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
            (is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));

        static_if<has_amd_asm>{}([&](auto fwd) {
            Run_amd_asm(p_a, p_b, fwd(p_c));
        }).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
        Run_source(p_a, p_b, p_c);
#endif
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -82,8 +82,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}(
            [&](auto long_vector_access_id) {

                // data id w.r.t slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;

@@ -109,9 +109,11 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const index_t buffer_offset = i * src_data_per_access;

                    const auto src_coord =
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check src data's valid mapping situation, only check the first data in this
                    // src
                    // vector. It's user's responsiblity to make sure all data in the src vector
                    // has the valid/invalid mapping situation
                    transfer_data<SrcData,

@@ -146,9 +148,11 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const index_t buffer_offset = i * dst_data_per_access;

                    const auto dst_coord =
                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check dst data's valid mapping situation, only check the first data in this
                    // dst
                    // vector. It's user's responsiblity to make sure all data in the dst vector
                    // has the valid/invalid mapping situation
                    transfer_data<DstData,

@@ -175,9 +179,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    {
        const auto step_sizes = to_array(step_sizes_);

        static_if<PositiveDirection>{}([&](auto) {
            mSrcSliceOrigin += to_array(step_sizes);
        }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <typename T, bool PositiveDirection>

@@ -186,9 +189,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    {
        const auto step_sizes = to_array(step_sizes_);

        static_if<PositiveDirection>{}([&](auto) {
            mDstSliceOrigin += step_sizes;
        }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
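The comments re-wrapped in the two middle hunks carry the interesting contract: only the first element of each long vector is checked for a valid source or destination mapping, and the caller must guarantee that the rest of the vector shares that status. A rough stand-alone sketch of that contract, using hypothetical helpers (coord_is_valid, copy_long_vector) that are not the repository's API:

    #include <array>
    #include <cstdio>

    constexpr int long_vector_size = 4;

    // hypothetical validity test standing in for the coordinate check on src_coord/dst_coord
    bool coord_is_valid(int offset, int tensor_size)
    {
        return offset >= 0 && offset + long_vector_size <= tensor_size;
    }

    // copy one long vector; only the first element's coordinate is inspected,
    // mirroring the "only check the first data in this src vector" comment
    void copy_long_vector(const float* src, int src_offset, int src_size, float* dst, int dst_offset)
    {
        if(!coord_is_valid(src_offset, src_size))
            return; // whole vector treated as invalid
        for(int i = 0; i < long_vector_size; ++i)
            dst[dst_offset + i] = src[src_offset + i];
    }

    int main()
    {
        std::array<float, 8> src{0, 1, 2, 3, 4, 5, 6, 7};
        std::array<float, 8> dst{};
        copy_long_vector(src.data(), 4, static_cast<int>(src.size()), dst.data(), 0);
        std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
        return 0;
    }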
driver/include/device_dummy_dynamic_transform.hpp
driver/include/host_conv.hpp
external/half/include/half.hpp
@@ -508,8 +508,8 @@ template <bool B>
struct bool_type : std::integral_constant<bool, B>
{
};

using std::false_type;
using std::true_type;

/// Type traits for floating-point types.
template <typename T>
@@ -854,8 +854,8 @@ inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y,
           ((x & 0x7FFF) > 0x7C00 && !(x & 0x200)) || ((y & 0x7FFF) > 0x7C00 && !(y & 0x200)) ||
           ((z & 0x7FFF) > 0x7C00 && !(z & 0x200)));
#endif
    return ((x & 0x7FFF) > 0x7C00) ? (x | 0x200)
                                   : ((y & 0x7FFF) > 0x7C00) ? (y | 0x200)
                                                             : (z | 0x200);
}

/// Select value or signaling NaN.
@@ -1756,9 +1756,9 @@ uint32 mulhi(uint32 x, uint32 y)
    uint32 xy = (x >> 16) * (y & 0xFFFF), yx = (x & 0xFFFF) * (y >> 16),
           c = (xy & 0xFFFF) + (yx & 0xFFFF) + (((x & 0xFFFF) * (y & 0xFFFF)) >> 16);
    return (x >> 16) * (y >> 16) + (xy >> 16) + (yx >> 16) + (c >> 16) +
           ((R == std::round_to_nearest)
                ? ((c >> 15) & 1)
                : (R == std::round_toward_infinity) ? ((c & 0xFFFF) != 0) : 0);
}

/// 64-bit multiplication.
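The mulhi hunk above is only re-indented, but the arithmetic it wraps is easy to sanity-check: the four 16x16 partial products reconstruct the high 32 bits of a 32x32 multiplication, and the carry word c also exposes the top of the low half for the rounding term. A small self-check of the truncating part of that formula against 64-bit arithmetic (my own test harness, not code from half.hpp):

    #include <cstdint>
    #include <cstdio>

    // Same partial-product scheme as the hunk, without the rounding term:
    // returns the high 32 bits of x * y using only 32-bit arithmetic.
    std::uint32_t mulhi_portable(std::uint32_t x, std::uint32_t y)
    {
        std::uint32_t xy = (x >> 16) * (y & 0xFFFF), yx = (x & 0xFFFF) * (y >> 16),
                      c  = (xy & 0xFFFF) + (yx & 0xFFFF) + (((x & 0xFFFF) * (y & 0xFFFF)) >> 16);
        return (x >> 16) * (y >> 16) + (xy >> 16) + (yx >> 16) + (c >> 16);
    }

    int main()
    {
        std::uint32_t x = 0xDEADBEEF, y = 0x12345678;
        std::uint64_t wide = static_cast<std::uint64_t>(x) * y; // reference result
        std::printf("%08X %08X\n",
                    static_cast<unsigned>(mulhi_portable(x, y)),
                    static_cast<unsigned>(wide >> 32));
        return 0;
    }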
@@ -2379,10 +2379,12 @@ unsigned int erf(unsigned int arg)
            t /
            ((x2.exp < 0) ? f31(exp2((x2.exp > -32) ? (x2.m >> -x2.exp) : 0, 30), 0)
                          : f31(exp2((x2.m << x2.exp) & 0x7FFFFFFF, 22), x2.m >> (31 - x2.exp)));
    return (!C || sign)
               ? fixed2half<R, 31, false, true, true>(
                     0x80000000 - (e.m >> (C - e.exp)), 14 + C, sign & (C - 1U))
               : (e.exp < -25)
                     ? underflow<R>()
                     : fixed2half<R, 30, false, false, true>(e.m >> 1, e.exp + 14, 0, e.m & 1);
}

/// Gamma function and postprocessing.
@@ -2402,8 +2404,7 @@ unsigned int gamma(unsigned int arg)
    for(unsigned int i=0; i<5; ++i)
        s += p[i+1] / (arg+i);
    return std::log(s) + (arg-0.5)*std::log(t) - t;
    */ static const f31 pi(0xC90FDAA2, 1),
        lbe(0xB8AA3B29, 0);
    unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
    bool bsign = sign != 0;
@@ -2789,7 +2790,7 @@ inline half operator"" _h(long double value)
{
    return half(detail::binary, detail::float2half<half::round_style>(value));
}
} // namespace literal
#endif

namespace detail {
@@ -2837,8 +2838,8 @@ struct half_caster<half, half, R>
{
    static half cast(half arg) { return arg; }
};
} // namespace detail
} // namespace half_float

/// Extensions to the C++ standard library.
namespace std {
@@ -3003,7 +3004,7 @@ struct hash<half_float::half>
    }
};
#endif
} // namespace std

namespace half_float {
/// \anchor compop
@@ -3122,10 +3123,11 @@ inline half operator+(half x, half y)
        return half(detail::binary,
                    (absx > 0x7C00 || absy > 0x7C00)
                        ? detail::signal(x.data_, y.data_)
                        : (absy != 0x7C00) ? x.data_
                                           : (sub && absx == 0x7C00) ? detail::invalid()
                                                                     : y.data_);
    if(!absx)
        return absy ? y
                    : half(detail::binary,
                           (half::round_style == std::round_toward_neg_infinity)
                               ? (x.data_ | y.data_)
                               : (x.data_ & y.data_));
@@ -3449,9 +3451,10 @@ inline half fma(half x, half y, half z)
                                          : (sign | 0x7C00))
                       : z;
    if(!absx || !absy)
        return absz
                   ? z
                   : half(detail::binary,
                          (half::round_style == std::round_toward_neg_infinity) ? (z.data_ | sign)
                                                                                : (z.data_ & sign));
    for(; absx < 0x400; absx <<= 1, --exp)
        ;
@@ -3516,8 +3519,7 @@ inline half fma(half x, half y, half z)
inline HALF_CONSTEXPR_NOERR half fmax(half x, half y)
{
    return half(detail::binary,
                (!isnan(y) && (isnan(x) || (x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) <
                                               (y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15))))))
                    ? detail::select(y.data_, x.data_)
                    : detail::select(x.data_, y.data_));
@@ -3533,8 +3535,7 @@ inline HALF_CONSTEXPR_NOERR half fmax(half x, half y)
inline HALF_CONSTEXPR_NOERR half fmin(half x, half y)
{
    return half(detail::binary,
                (!isnan(y) && (isnan(x) || (x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15)))) >
                                               (y.data_ ^ (0x8000 | (0x8000 - (y.data_ >> 15))))))
                    ? detail::select(y.data_, x.data_)
                    : detail::select(x.data_, y.data_));
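The expression re-wrapped in the fmax/fmin hunks, x.data_ ^ (0x8000 | (0x8000 - (x.data_ >> 15))), maps a half's sign-magnitude bit pattern to an unsigned key that is monotone in the represented value, so the two operands can be ordered by comparing raw bits. A short check of that mapping on a few bit patterns (my own illustration, not part of half.hpp):

    #include <cstdint>
    #include <cstdio>

    // Turn a half-precision bit pattern into a key whose unsigned order matches
    // the numeric order: negative values get all bits flipped, positive values
    // get the sign bit set.
    unsigned ordered_key(std::uint16_t bits)
    {
        return (bits ^ (0x8000 | (0x8000 - (bits >> 15)))) & 0xFFFF;
    }

    int main()
    {
        std::uint16_t neg_two = 0xC000, neg_one = 0xBC00, one = 0x3C00, two = 0x4000;
        // expected: strictly increasing keys for -2, -1, +1, +2
        std::printf("%04X %04X %04X %04X\n",
                    ordered_key(neg_two), ordered_key(neg_one), ordered_key(one), ordered_key(two));
        return 0;
    }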
@@ -3886,9 +3887,9 @@ inline half log1p(half arg)
#else
    if(arg.data_ >= 0xBC00)
        return half(detail::binary,
                    (arg.data_ == 0xBC00)
                        ? detail::pole(0x8000)
                        : (arg.data_ <= 0xFC00) ? detail::invalid() : detail::signal(arg.data_));
    int abs = arg.data_ & 0x7FFF, exp = -15;
    if(!abs || abs >= 0x7C00)
        return (abs > 0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
@@ -5354,13 +5355,13 @@ inline HALF_CONSTEXPR half copysign(half x, half y)
/// \retval FP_NORMAL for all other (normal) values
inline HALF_CONSTEXPR int fpclassify(half arg)
{
    return !(arg.data_ & 0x7FFF)
               ? FP_ZERO
               : ((arg.data_ & 0x7FFF) < 0x400)
                     ? FP_SUBNORMAL
                     : ((arg.data_ & 0x7FFF) < 0x7C00)
                           ? FP_NORMAL
                           : ((arg.data_ & 0x7FFF) == 0x7C00) ? FP_INFINITE : FP_NAN;
}

/// Check if finite number.
@@ -5652,7 +5653,7 @@ inline void fethrowexcept(int excepts, const char* msg = "")
        throw std::range_error(msg);
}
/// \}
} // namespace half_float

#undef HALF_UNUSED_NOERR
#undef HALF_CONSTEXPR