gaoqiong / composable_kernel · Commit a3b4c5cb

merge develop branch and add gridwise pipeline v3

Authored Jun 03, 2022 by wangshaojie6
Parents: 48918ab9, 1677cf70

Changes: 361 files in this merge; showing the 20 changed files below, with 624 additions and 411 deletions.
include/ck/utility/common_header.hpp                          +2    -1
include/ck/utility/dynamic_buffer.hpp                         +41   -1
include/ck/utility/generic_memory_space_atomic.hpp            +120  -0
include/ck/utility/get_id.hpp                                 +10   -2
include/ck/utility/inner_product.hpp                          +2    -5
include/ck/utility/math_v2.hpp                                +60   -10
include/ck/utility/number.hpp                                 +3    -0
include/ck/utility/reduction_functions_accumulate.hpp         +14   -21
include/ck/utility/reduction_operator.hpp                     +55   -14
include/ck/utility/static_buffer.hpp                          +13   -2
include/ck/utility/statically_indexed_array_multi_index.hpp   +7    -0
include/ck/utility/thread_group.hpp                           +18   -0
include/ck/utility/tuple.hpp                                  +5    -6
include/ck/utility/type.hpp                                   +3    -0
library/include/ck/library/host/host_interface.hpp            +54   -0
library/include/ck/library/host_tensor/device.hpp             +75   -36
library/include/ck/library/host_tensor/host_common_util.hpp   +102  -0
library/include/ck/library/host_tensor/host_reduce_util.hpp   +0    -269
library/include/ck/library/host_tensor/host_reduction.hpp     +38   -42
library/include/ck/library/host_tensor/host_tensor.hpp        +2    -2
include/ck/utility/common_header.hpp

@@ -28,10 +28,11 @@
 #include "transpose_vectors.hpp"
 #include "inner_product.hpp"
 #include "element_wise_operation.hpp"
+#include "thread_group.hpp"
 #include "debug.hpp"
 #include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic_add.hpp"
+#include "generic_memory_space_atomic.hpp"
 #include "get_id.hpp"
 #include "synchronization.hpp"
 #include "amd_address_space.hpp"
include/ck/utility/dynamic_buffer.hpp

@@ -3,7 +3,7 @@
 #include "enable_if.hpp"
 #include "c_style_pointer_cast.hpp"
 #include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic_add.hpp"
+#include "generic_memory_space_atomic.hpp"

 namespace ck {

@@ -125,6 +125,10 @@ struct DynamicBuffer
        {
            this->template AtomicAdd<X>(i, is_valid_element, x);
        }
+       else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
+       {
+           this->template AtomicMax<X>(i, is_valid_element, x);
+       }
        else if constexpr(Op == InMemoryDataOperationEnum::Add)
        {
            auto tmp = this->template Get<X>(i, is_valid_element);

@@ -326,6 +330,42 @@ struct DynamicBuffer
        }
    }

+   template <typename X,
+             typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
+                                        typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                bool>::type = false>
+   __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
+   {
+       // X contains multiple T
+       constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+       constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+
+       static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
+                     "wrong! X should contain multiple T");
+
+       static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
+
+#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
+       using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+       bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
+#else
+       bool constexpr use_amd_buffer_addressing = false;
+#endif
+
+       if constexpr(use_amd_buffer_addressing)
+       {
+           constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+           amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
+               x, p_data_, i, is_valid_element, element_space_size_);
+       }
+       else if(is_valid_element)
+       {
+           atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
+       }
+   }
+
    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
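The new AtomicMax branch above is selected by DynamicBuffer's Update() through `if constexpr` on the InMemoryDataOperationEnum template parameter, so only the chosen branch is ever instantiated. A minimal host-only sketch of that dispatch pattern (hypothetical function, not the CK implementation; on the device the atomic branches would call the atomic_add/atomic_max wrappers):

#include <algorithm>

enum class InMemoryDataOperationEnum { Set, Add, AtomicAdd, AtomicMax };

// Dispatch resolved at compile time: unselected branches are never built,
// so a buffer type can reject operations it cannot support.
template <InMemoryDataOperationEnum Op, typename T>
void update(T& dst, T x)
{
    if constexpr(Op == InMemoryDataOperationEnum::Set)
        dst = x;
    else if constexpr(Op == InMemoryDataOperationEnum::Add)
        dst = dst + x;
    else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
        dst = dst + x; // device code would call atomic_add<T> here
    else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
        dst = std::max(dst, x); // device code would call atomic_max<T> here
}

int main()
{
    float v = 1.0f;
    update<InMemoryDataOperationEnum::AtomicMax>(v, 3.0f); // v == 3.0f
}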
include/ck/utility/generic_memory_space_atomic_add.hpp → include/ck/utility/generic_memory_space_atomic.hpp

@@ -3,6 +3,10 @@
 namespace ck {

+// Caution: DO NOT REMOVE
+// intentionally have only declaration but no definition to cause compilation failure when trying to
+// instantiate this template. The purpose is to make the implementation of atomic_add explicit for
+// each datatype.
 template <typename X>
 __device__ X atomic_add(X* p_dst, const X& x);

@@ -24,6 +28,12 @@ __device__ float atomic_add<float>(float* p_dst, const float& x)
     return atomicAdd(p_dst, x);
 }

+template <>
+__device__ double atomic_add<double>(double* p_dst, const double& x)
+{
+    return atomicAdd(p_dst, x);
+}
+
 template <>
 __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
 {

@@ -41,4 +51,70 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
     return vy.template AsType<float2_t>()[I0];
 }

+template <>
+__device__ double2_t atomic_add<double2_t>(double2_t* p_dst, const double2_t& x)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    const vector_type<double, 2> vx{x};
+    vector_type<double, 2> vy{0};
+
+    vy.template AsType<double>()(I0) =
+        atomicAdd(c_style_pointer_cast<double*>(p_dst), vx.template AsType<double>()[I0]);
+    vy.template AsType<double>()(I1) =
+        atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, vx.template AsType<double>()[I1]);
+
+    return vy.template AsType<double2_t>()[I0];
+}
+
+// Caution: DO NOT REMOVE
+// intentionally have only declaration but no definition to cause compilation failure when trying to
+// instantiate this template. The purpose is to make the implementation of atomic_max explicit for
+// each datatype.
+template <typename X>
+__device__ X atomic_max(X* p_dst, const X& x);
+
+template <>
+__device__ int32_t atomic_max<int32_t>(int32_t* p_dst, const int32_t& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ uint32_t atomic_max<uint32_t>(uint32_t* p_dst, const uint32_t& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ float atomic_max<float>(float* p_dst, const float& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ double atomic_max<double>(double* p_dst, const double& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ float2_t atomic_max<float2_t>(float2_t* p_dst, const float2_t& x)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    const vector_type<float, 2> vx{x};
+    vector_type<float, 2> vy{0};
+
+    vy.template AsType<float>()(I0) =
+        atomicMax(c_style_pointer_cast<float*>(p_dst), vx.template AsType<float>()[I0]);
+    vy.template AsType<float>()(I1) =
+        atomicMax(c_style_pointer_cast<float*>(p_dst) + 1, vx.template AsType<float>()[I1]);
+
+    return vy.template AsType<float2_t>()[I0];
+}
+
 } // namespace ck
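The "declaration but no definition" comment above describes a deliberate pattern: the primary template may be named, but it can never be instantiated, so every supported datatype must supply an explicit specialization. A self-contained sketch of the same pattern (hypothetical function, not part of CK):

// Primary template: declared only. Calling it for a type without an explicit
// specialization fails at link time instead of silently picking a generic
// (and possibly wrong) implementation.
template <typename X>
X from_float(float x);

template <>
int from_float<int>(float x)
{
    return static_cast<int>(x);
}

int main()
{
    int i = from_float<int>(3.7f); // OK: explicit specialization exists
    // short s = from_float<short>(3.7f); // link error: no definition
    return i == 3 ? 0 : 1;
}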
include/ck/utility/get_id.hpp

@@ -3,14 +3,22 @@
 namespace ck {

-__device__ constexpr index_t get_wave_size() { return CK_GPU_WAVE_SIZE; }
+__host__ __device__ constexpr index_t get_warp_size()
+{
+    // warpSize is defined by HIP
+    return warpSize;
+}

 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }

-__device__ index_t get_wave_local_1d_id() { return threadIdx.x / get_wave_size(); }

 __device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }

+__device__ index_t get_warp_local_1d_id() { return threadIdx.x / get_warp_size(); }
+
 __device__ index_t get_block_1d_id() { return blockIdx.x; }

 __device__ index_t get_grid_size() { return gridDim.x; }

+__device__ index_t get_block_size() { return blockDim.x; }
+
 } // namespace ck
include/ck/utility/inner_product.hpp

-#ifndef CK_INNER_PRODUCT_HPP
-#define CK_INNER_PRODUCT_HPP
+#pragma once

 #include "data_type.hpp"

 namespace ck {

@@ -138,7 +136,7 @@ template <>
 __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 {
-#if defined(CK_USE_DOT4_I32_I8)
+#if defined(CK_USE_AMD_V_DOT4_I32_I8)
 #if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
     asm volatile("\n\
             v_dot4_i32_i8 %0, %1, %2, %0\n\

@@ -202,4 +200,3 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
 }

 } // namespace ck
-#endif
include/ck/utility/math_v2.hpp

@@ -3,11 +3,13 @@
 #include <cmath>
 #include "data_type.hpp"
-#include "half.hpp"
+#include "type.hpp"

 namespace ck {
 namespace math {

+// math functions for the host, some are implemented by calling C++ std functions
+
 static inline __host__ float abs(float x) { return std::abs(x); };

 static inline __host__ double abs(double x) { return std::abs(x); };

@@ -28,26 +30,26 @@ static inline __host__ int32_t abs(int32_t x)
 static inline __host__ half_t abs(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);

-    half_float::half abs_xx = half_float::abs(xx);
+    uint16_t abs_xx = xx & 0x7fff;

-    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);

     return abs_x;
 };

-static inline __host__ float isnan(float x) { return std::isnan(x); };
+static inline __host__ bool isnan(float x) { return std::isnan(x); };

-static inline __host__ double isnan(double x) { return std::isnan(x); };
+static inline __host__ bool isnan(double x) { return std::isnan(x); };

-static inline __host__ int8_t isnan(int8_t x)
+static inline __host__ bool isnan(int8_t x)
 {
     (void)x;
     return false;
 };

-static inline __host__ int32_t isnan(int32_t x)
+static inline __host__ bool isnan(int32_t x)
 {
     (void)x;
     return false;

@@ -55,11 +57,59 @@ static inline __host__ int32_t isnan(int32_t x)
 static inline __host__ bool isnan(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);

-    return half_float::isnan(xx);
+    return (xx & 0x7FFF) > 0x7C00;
 };

+static inline __host__ float sqrt(float x) { return std::sqrt(x); };
+
+static inline __host__ double sqrt(double x) { return std::sqrt(x); };
+
+// math functions for the HIP kernel, some are implemented by calling hip builtin functions
+
+static inline __device__ float abs(float x) { return ::abs(x); };
+
+static inline __device__ double abs(double x) { return ::abs(x); };
+
+static inline __device__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ half_t abs(half_t x) { return ::__habs(x); };
+
+static inline __device__ bool isnan(float x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(double x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(half_t x) { return ::__hisnan(x); };
+
+static inline __device__ float sqrt(float x) { return ::sqrtf(x); };
+
+static inline __device__ double sqrt(double x) { return ::sqrt(x); };
+
 } // namespace math
 } // namespace ck
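The new host-side half_t helpers rely on the binary16 layout: bit 15 is the sign, bits 14-10 the exponent, bits 9-0 the mantissa. Masking with 0x7fff clears the sign (giving |x|), and a value is NaN exactly when its magnitude bits exceed 0x7C00, the encoding of +infinity. A small self-check of those two facts (plain C++; no half type needed, just the raw bit patterns):

#include <cstdint>
#include <cassert>

int main()
{
    uint16_t neg_two = 0xC000;             // -2.0 in binary16
    assert((neg_two & 0x7fff) == 0x4000);  // masking the sign bit yields +2.0

    uint16_t inf = 0x7C00;                 // +infinity: exponent all ones, mantissa 0
    uint16_t nan = 0x7E00;                 // a quiet NaN: exponent all ones, mantissa != 0
    assert(!((inf & 0x7FFF) > 0x7C00));    // infinity is not NaN
    assert(((nan & 0x7FFF) > 0x7C00));     // NaN magnitude exceeds the infinity encoding
}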
include/ck/utility/number.hpp

@@ -8,5 +8,8 @@ namespace ck {
 template <index_t N>
 using Number = integral_constant<index_t, N>;

+template <index_t N>
+using LongNumber = integral_constant<long_index_t, N>;
+
 } // namespace ck
 #endif
include/ck/utility/reduction_functions_accumulate.hpp

@@ -27,6 +27,7 @@
 #define CK_REDUCTION_FUNCTIONS_BINOP_HPP

 #include "data_type.hpp"
+#include "math_v2.hpp"
 #include "reduction_common.hpp"
 #include "reduction_operator.hpp"

@@ -34,18 +35,6 @@
 namespace ck {
 namespace detail {

-template <typename T>
-static inline __device__ bool is_nan(T x)
-{
-    return (isnan(x));
-};
-
-template <>
-inline __device__ bool is_nan<half_t>(half_t x)
-{
-    return (__hisnan(x));
-};
-
 template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck;

@@ -53,7 +42,7 @@ template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 {
     // cppcheck-suppress constParameter
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
         ReduceOperation{}(accuVal, currVal);
     };

@@ -62,9 +51,11 @@ struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
 {
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal = currVal;
         }

@@ -81,7 +72,7 @@ struct AccumulateWithIndexAndNanCheck;
 template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
 struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
 {
-    __device__ static inline void
+    __host__ __device__ static inline void
     // cppcheck-suppress constParameter
     Calculate(AccDataType& accuVal,
               AccDataType currVal,

@@ -101,12 +92,14 @@ template <typename ReduceOperation, typename AccDataType, typename IndexDataType
 struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
 {
     // The method is called when the ReduceOperation is indexable and the user asked for indices
-    __device__ static inline void Calculate(AccDataType& accuVal,
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal,
                                             AccDataType currVal,
                                             IndexDataType& accuIndex,
                                             IndexDataType currIndex)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal   = currVal;
             accuIndex = currIndex;
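With PropagateNan == true, the accumulation above makes NaN sticky: a NaN input replaces the accumulated value outright, which matters for Min/Max because every comparison against NaN is false and would otherwise silently drop it. A host-only sketch of the policy (simplified; CK's version is the templated struct above):

#include <cmath>
#include <cstdio>

// A NaN input wins unconditionally; otherwise defer to the reduce functor.
template <typename Op, typename T>
void accumulate_with_nan_check(T& accu, T curr)
{
    if(std::isnan(curr))
        accu = curr;
    else
        Op{}(accu, curr);
}

struct MaxOp
{
    void operator()(float& a, float b) const { if(a < b) a = b; }
};

int main()
{
    float acc = -1.0f;
    for(float v : {2.0f, NAN, 5.0f})
        accumulate_with_nan_check<MaxOp>(acc, v);
    // The later 5.0f cannot displace the NaN: (nan < 5.0f) is false.
    std::printf("%f\n", acc); // prints nan
}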
include/ck/utility/reduction_operator.hpp

@@ -26,7 +26,8 @@
 #ifndef CK_REDUCTION_OPERATOR_HPP
 #define CK_REDUCTION_OPERATOR_HPP

-#include "common_header.hpp"
+#include "config.hpp"
+#include "data_type.hpp"

 namespace ck {

@@ -35,18 +36,16 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 //    operator, "identity element" is the unique
 //    element in the algebraic space that doesn't affect the value of other elements
 //    when operated against them, and the concept is similar to zero vector in
 //    vector space
 //    (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf).
-// 2) indexable -- boolean value indicating whether indices of the operated elements could be
-//    recorded. Usually, Min/Max operator could
-//    need to record the indices of elements. For operator like Add/Mul, no need to
-//    record the indices.
+// 2) IsCompatibleInMemoryDataOperation() -- return true if the reduction task corresponding to this
+//    operator can use the InMemoryDataOperation to finalize, or else it return false
 // 3) operator() -- the first argument of the operator must be both an input & output, and the
 //    corresponding variable usually stores
 //    the accumulated result of many operator() calls; the second argument is only an
 //    input. For indexable binary
 //    operator, the second version of operator() has third argument (which is an

@@ -60,7 +59,14 @@ struct Add
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
+
+    __device__ static constexpr bool
+    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        return operation == InMemoryDataOperationEnum::AtomicAdd ||
+               operation == InMemoryDataOperationEnum::Set;
+    };

     __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
 };

@@ -70,7 +76,13 @@ struct Mul
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };
+
+    __device__ static constexpr bool
+    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        return operation == InMemoryDataOperationEnum::Set;
+    };

     __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
 };

@@ -80,11 +92,18 @@ struct Max
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };

+    __device__ static constexpr bool
+    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_max to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a < b)

@@ -106,9 +125,13 @@ struct Min
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Max();
     };

+    __device__ static constexpr bool
+    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_min to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const

@@ -132,7 +155,14 @@ struct AMax
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
+
+    __device__ static constexpr bool
+    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_max to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };

     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {

@@ -150,6 +180,17 @@ struct AMax
     }
 };

+template <typename T>
+T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+{
+    T result = ck::type_convert<T>(0.0f);
+
+    if(operation == InMemoryDataOperationEnum::AtomicMax)
+        result = ck::NumericLimits<T>::Lowest();
+
+    return (result);
+};
+
 }; // end of namespace reduce

 } // end of namespace ck
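For reference, a minimal functor satisfying the three-member contract described in the comment above might look like the following (a hypothetical LogicalOr reduction, not part of CK; assumes CK's InMemoryDataOperationEnum and the HIP __host__/__device__ qualifiers are in scope):

template <typename T>
struct LogicalOr
{
    using dataType = T;

    // 1) identity element: (false OR x) == x
    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0); }

    // 2) which in-memory finalization operations are valid for this reduction
    __device__ static constexpr bool
    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
    {
        return operation == InMemoryDataOperationEnum::Set;
    }

    // 3) accumulate: the first argument is both input and output
    __host__ __device__ inline constexpr void operator()(T& a, T b) const
    {
        a = (a != 0 || b != 0) ? static_cast<T>(1) : static_cast<T>(0);
    }
};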
include/ck/utility/static_buffer.hpp

@@ -36,6 +36,11 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
     {
         return base::operator()(i);
     }
+
+    __host__ __device__ void Clear()
+    {
+        static_for<0, N, 1>{}([&](auto i) { operator()(i) = T{0}; });
+    }
 };

 // static buffer for vector

@@ -146,9 +151,9 @@ struct StaticBufferTupleOfVector
     __host__ __device__ void Clear()
     {
-        const index_t numScalars = NumOfVector * ScalarPerVector;
+        constexpr index_t NumScalars = NumOfVector * ScalarPerVector;

-        static_for<0, Number<numScalars>{}, 1>{}([&](auto i) { SetAsType(i, S{0}); });
+        static_for<0, NumScalars, 1>{}([&](auto i) { SetAsType(i, S{0}); });
     }
 };

@@ -158,5 +163,11 @@ __host__ __device__ constexpr auto make_static_buffer(Number<N>)
     return StaticBuffer<AddressSpace, T, N, true>{};
 }

+template <AddressSpaceEnum AddressSpace, typename T, long_index_t N>
+__host__ __device__ constexpr auto make_static_buffer(LongNumber<N>)
+{
+    return StaticBuffer<AddressSpace, T, N, true>{};
+}
+
 } // namespace ck
 #endif
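The new Clear() methods rely on CK's static_for, which invokes a functor with compile-time indices so that statically indexed storage can be addressed in an unrolled loop. A self-contained sketch of the idiom (hypothetical recursive implementation; CK's static_for is more general):

#include <utility>

// Calls f(integral_constant<int, First>{}), f(...First+Inc...), up to Last.
template <int First, int Last, int Inc, typename F>
void static_for_sketch(F f)
{
    if constexpr(First < Last)
    {
        f(std::integral_constant<int, First>{});
        static_for_sketch<First + Inc, Last, Inc>(f);
    }
}

int main()
{
    int buf[4];
    // Each iteration receives a compile-time index, as in Clear() above.
    static_for_sketch<0, 4, 1>([&](auto i) { buf[i.value] = 0; });
    return buf[3];
}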
include/ck/utility/statically_indexed_array_multi_index.hpp

@@ -93,6 +93,13 @@ __host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
     return r;
 }

+// MultiIndex = MultiIndex * index_t
+template <typename... Xs>
+__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, index_t a)
+{
+    return a * x;
+}
+
 template <typename... Xs>
 __host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
 {
include/ck/utility/thread_group.hpp (new file)

#pragma once

#include "get_id.hpp"

namespace ck {

template <index_t ThreadPerBlock>
struct ThisThreadBlock
{
    static constexpr index_t kNumThread_ = ThreadPerBlock;

    __device__ static constexpr index_t GetNumOfThread() { return kNumThread_; }

    __device__ static constexpr bool IsBelong() { return true; }

    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
};

} // namespace ck
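ThisThreadBlock gives gridwise kernels a compile-time handle on the executing thread group. A hypothetical HIP sketch of how a kernel template might consume it (the kernel and launch shape are illustrative, not from this commit):

#include <hip/hip_runtime.h>
#include "ck/utility/thread_group.hpp" // include path assumed

template <typename ThreadGroup>
__global__ void fill_one(float* p)
{
    // Block size is a compile-time constant; the thread id stays runtime.
    static_assert(ThreadGroup::GetNumOfThread() > 0, "group size known at compile time");

    if(ThreadGroup::IsBelong())
        p[ThreadGroup::GetThreadId()] = 1.0f;
}

// launch, e.g.: fill_one<ck::ThisThreadBlock<256>><<<1, 256>>>(ptr);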
include/ck/utility/tuple.hpp

@@ -21,9 +21,9 @@ struct TupleElement
 {
     __host__ __device__ constexpr TupleElement() = default;

     template <typename T,
-              typename enable_if<!is_same<remove_reference_t<remove_cv_t<T>>, TupleElement>::value,
-                                 bool>::type = false>
+              typename enable_if<!is_same<remove_cvref_t<T>, TupleElement>::value, bool>::type =
+                  false>
     __host__ __device__ constexpr TupleElement(T&& v) : mData(std::forward<T>(v))
     {
     }

@@ -60,7 +60,7 @@ struct TupleImpl<Sequence<Is...>, Xs...> : TupleElement<TupleElementKey<Is>, Xs>
     template <typename Y,
               typename enable_if<sizeof...(Is) == 1 && sizeof...(Xs) == 1 &&
-                                     !is_same<remove_reference_t<remove_cv_t<Y>>, TupleImpl>::value,
+                                     !is_same<remove_cvref_t<Y>, TupleImpl>::value,
                                  bool>::type = false>
     __host__ __device__ constexpr TupleImpl(Y&& y)
         : TupleElement<TupleElementKey<Is>, Xs>(std::forward<Y>(y))...

@@ -101,8 +101,7 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
     __host__ __device__ constexpr Tuple() = default;

     template <typename Y,
               typename enable_if<sizeof...(Xs) == 1 &&
-                                     !is_same<remove_reference_t<remove_cv_t<Y>>, Tuple>::value,
+                                     !is_same<remove_cvref_t<Y>, Tuple>::value,
                                  bool>::type = false>
     __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward<Y>(y))
     {
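These hunks only swap the spelled-out remove_reference_t<remove_cv_t<...>> for the remove_cvref_t alias, but the constraint itself is the classic guard that keeps a forwarding constructor from out-competing the copy constructor. A self-contained sketch of why the guard is needed (hypothetical Wrapper type, standard C++ traits instead of CK's):

#include <type_traits>
#include <utility>

template <typename T>
struct Wrapper
{
    T mData{};

    Wrapper() = default;

    // Without the enable_if, Wrapper(U&&) would be a better match than the
    // copy constructor for a non-const lvalue Wrapper argument.
    template <typename U,
              typename std::enable_if<
                  !std::is_same<std::remove_cv_t<std::remove_reference_t<U>>, Wrapper>::value,
                  bool>::type = false>
    Wrapper(U&& v) : mData(std::forward<U>(v))
    {
    }
};

int main()
{
    Wrapper<int> a;
    a.mData = 3;
    Wrapper<int> b(a); // resolves to the copy constructor, not the template
    return b.mData == 3 ? 0 : 1;
}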
include/ck/utility/type.hpp

@@ -29,6 +29,9 @@ using remove_cv_t = typename std::remove_cv<T>::type;
 template <typename T>
 using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;

+template <typename T>
+using remove_pointer_t = typename std::remove_pointer<T>::type;
+
 template <typename T>
 inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
library/include/ck/library/host/host_interface.hpp (new file)

#pragma once

#include <memory>
#include <string>

#include "stream_config.hpp"
#include "config.hpp"
#include "device_base.hpp"

struct DeviceConvFwdPtr_t
{
    using BaseArgument = ck::tensor_operation::device::BaseArgument;
    using BaseInvoker  = ck::tensor_operation::device::BaseInvoker;

    struct DeviceConvFwdPtrImpl;
    std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;

    DeviceConvFwdPtr_t();
    ~DeviceConvFwdPtr_t();
    DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
    DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
    DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&)       = delete;
    DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(void* in_ptr,
                        void* wei_ptr,
                        void* out_ptr,
                        size_t N,
                        size_t K,
                        size_t C,
                        std::vector<ck::index_t> input_spatial_lengths,
                        std::vector<ck::index_t> filter_spatial_lengths,
                        std::vector<ck::index_t> output_spatial_lengths,
                        std::vector<ck::index_t> conv_filter_strides,
                        std::vector<ck::index_t> conv_filter_dilations,
                        std::vector<ck::index_t> input_left_pads,
                        std::vector<ck::index_t> input_right_pads) const;
    // in, wei and out element ops are ignored for now since even if we change them, they
    // cant be linked

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() const; // requires including BaseInvoker headers

    std::string GetTypeString();
    bool IsSupportedArgument(const BaseArgument* arg_ptr);
};

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(std::vector<DeviceConvFwdPtr_t>& instances);
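This header exposes the conv-forward instances behind an opaque pImpl so client code links without the kernel template headers. A hypothetical host-side usage sketch (tensor sizes are made up; the invoker's Run signature lives in the BaseInvoker headers and is not shown here):

#include <vector>

void run_all_f32_instances(void* in, void* wei, void* out)
{
    std::vector<DeviceConvFwdPtr_t> instances;
    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(instances);

    for(auto& conv : instances)
    {
        // N=1, K=64, C=32, 28x28 input, 3x3 filter, stride 1, no padding -> 26x26 output
        auto arg = conv.MakeArgumentPointer(in, wei, out,
                                            1, 64, 32,
                                            {28, 28}, {3, 3}, {26, 26},
                                            {1, 1}, {1, 1}, {0, 0}, {0, 0});
        if(!conv.IsSupportedArgument(arg.get()))
            continue;

        auto invoker = conv.MakeInvokerPointer();
        // invoker->Run(arg.get(), ...); // exact signature depends on BaseInvoker
    }
}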
library/include/ck/library/host_tensor/device.hpp

-#ifndef DEVICE_HPP
-#define DEVICE_HPP
+#pragma once

 #include <memory>
 #include <functional>
 #include <thread>
 #include <chrono>
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include "stream_config.hpp"
+#include "ck/options.hpp"
+
+template <typename T>
+__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
+{
+    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
+    {
+        p[i] = x;
+    }
+}
+
+inline void hip_check_error(hipError_t x)
+{
+    if(x != hipSuccess)
+    {
+        std::ostringstream ss;
+        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
+           << "in function: " << __func__;
+        throw std::runtime_error(ss.str());
+    }
+}

 struct DeviceMem
 {

@@ -17,6 +39,16 @@ struct DeviceMem
     void ToDevice(const void* p);
     void FromDevice(void* p);
     void SetZero();
+
+    template <typename T>
+    void SetValue(T x)
+    {
+        if(mMemSize % sizeof(T) != 0)
+        {
+            throw std::runtime_error("wrong! not entire DeviceMem will be set");
+        }
+
+        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
+    }
+
     ~DeviceMem();

     void* mpDeviceBuf;

@@ -36,49 +68,56 @@ struct KernelTimer
     std::unique_ptr<KernelTimerImpl> impl;
 };

-using device_stream_t = hipStream_t;
-
-template <typename... Args, typename F>
-void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
-{
-    hipStream_t stream_id = nullptr;
-
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-}
-
 template <typename... Args, typename F>
-float launch_and_time_kernel(
-    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
+float launch_and_time_kernel(const StreamConfig& stream_config,
+                             F kernel,
+                             dim3 grid_dim,
+                             dim3 block_dim,
+                             std::size_t lds_byte,
+                             Args... args)
 {
-    KernelTimer timer;
-
-    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-           __func__,
-           grid_dim.x, grid_dim.y, grid_dim.z,
-           block_dim.x, block_dim.y, block_dim.z);
-
-    printf("Warm up \n");
-
-    hipStream_t stream_id = nullptr;
-
-    // warm up
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-
-    printf("Start running %d times... \n", nrepeat);
-
-    timer.Start();
-
-    for(int i = 0; i < nrepeat; ++i)
-    {
-        hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-    }
-
-    timer.End();
-
-    return timer.GetElapsedTime() / nrepeat;
+#if CK_TIME_KERNEL
+    if(stream_config.time_kernel_)
+    {
+        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+               __func__,
+               grid_dim.x, grid_dim.y, grid_dim.z,
+               block_dim.x, block_dim.y, block_dim.z);
+
+        const int nrepeat = 10;
+
+        KernelTimer timer;
+
+        printf("Warm up 1 time \n");
+
+        // warm up
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+        printf("Start running %d times... \n", nrepeat);
+
+        timer.Start();
+
+        for(int i = 0; i < nrepeat; ++i)
+        {
+            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        }
+
+        timer.End();
+
+        return timer.GetElapsedTime() / nrepeat;
+    }
+    else
+    {
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+        return 0;
+    }
+#else
+    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+    return 0;
+#endif
 }
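A hypothetical host-side usage of the new pieces above (the DeviceMem size constructor and the timed-launch return value are assumed from the rest of the library, not shown in this diff; the kernel is illustrative):

// Fill a 1024-float buffer, then launch a kernel and get the mean time in ms.
DeviceMem buf(1024 * sizeof(float)); // constructor taking a byte size assumed
buf.SetValue(0.5f);                  // throws if mMemSize % sizeof(T) != 0

StreamConfig cfg{nullptr, /*time_kernel_=*/true}; // field order assumed
float avg_ms = launch_and_time_kernel(cfg, some_kernel, dim3(64), dim3(256), 0,
                                      static_cast<float*>(buf.mpDeviceBuf));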
library/include/ck/library/host_tensor/host_common_util.hpp (new file)

/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef GUARD_HOST_COMMON_UTIL_HPP
#define GUARD_HOST_COMMON_UTIL_HPP

#include <vector>
#include <iostream>
#include <fstream>
#include <string>

#include "config.hpp"

namespace ck {
namespace host_common {

template <typename T>
static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);
    if(outFile)
    {
        outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
        outFile.close();
        std::cout << "Write output to file " << fileName << std::endl;
    }
    else
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
    }
};

template <typename T>
static inline T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    T val;

    iss >> val;

    return (val);
};

template <typename T>
static inline std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    std::string valuesStr(cstr_values);

    std::vector<T> values;
    std::size_t pos = 0;
    std::size_t new_pos;

    new_pos = valuesStr.find(',', pos);
    while(new_pos != std::string::npos)
    {
        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);

        T val = getSingleValueFromString<T>(sliceStr);

        values.push_back(val);

        pos     = new_pos + 1;
        new_pos = valuesStr.find(',', pos);
    };

    std::string sliceStr = valuesStr.substr(pos);
    T val                = getSingleValueFromString<T>(sliceStr);
    values.push_back(val);

    return (values);
}

}; // namespace host_common
}; // namespace ck

#endif
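getTypeValuesFromString splits a comma-separated string and parses each slice through an istringstream, which is handy for command-line dimension lists. A hypothetical usage (assuming the header above is on the include path):

#include <cstdio>

int main()
{
    auto lens = ck::host_common::getTypeValuesFromString<int>("64,3,224,224");
    for(int v : lens)
        std::printf("%d ", v); // prints: 64 3 224 224
}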
library/include/ck/library/host_tensor/host_reduce_util.hpp (deleted; its role is taken over by the functor-based accumulators)

/* (MIT license header identical to the one in host_common_util.hpp above) */
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP

#include <limits>
#include <cmath>
#include <cassert>
#include <stdexcept>
#include <string>

#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"

namespace ck {
namespace host_reduce {

using ck::NanPropagation;
using ck::ReduceTensorOp;

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using ck::math::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        return ([&](AccDataType&) {});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
    {
        return ([&, divider](AccDataType& a_) {
            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        // ReduceTensorOp::AMAX:
        return ([&](AccDataType&) {});
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::NORM2:
        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
        // ReduceTensorOp::ADD
        // ReduceTensorOp::AVG
        // ReduceTensorOp::NORM1
        // ReduceTensorOp::NORM2
        return (static_cast<AccDataType>(0.0f));
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        opReduce(accuVal, currVal);
    }
    else
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce(accuVal, currVal);
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                      AccDataType& accuVal,
                      AccDataType currVal,
                      int& accuIndex,
                      int currIndex)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        bool changed;

        opReduce(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    }
    else
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed;

            opReduce(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        };
    };
};

}; // namespace host_reduce

static inline std::vector<int> to_int_vector(const std::vector<size_t>& inData)
{
    std::vector<int> outData;

    for(auto elem : inData)
        outData.push_back(static_cast<int>(elem));

    return (outData);
};

}; // namespace ck

#endif
library/include/ck/library/host_tensor/host_reduction.hpp

@@ -33,9 +33,10 @@
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
+#include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"

 template <int NDim>
 static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,

@@ -105,11 +106,13 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ck::ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
     using IndexDataType = int32_t;

@@ -121,8 +124,6 @@ struct ReductionHost
     std::vector<int> reduceDims;
     IndexDataType divider;
-    std::function<void(AccDataType&)> preUnaryOp;
-    std::function<void(AccDataType&)> posUnaryOp;
     std::array<size_t, NumReduceDim> reduceLengths;
     std::array<size_t, NumReduceDim> reduceStrides;
     std::array<size_t, NumInvariantDim> invariantLengths;

@@ -136,9 +137,6 @@ struct ReductionHost
               const std::vector<int>& invariantDims_,
               const std::vector<int>& reduceDims_)
     {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
         // this->outLengths = to_int_vector(outDesc.GetLengths());
         this->outStrides = outDesc.GetStrides();

@@ -170,9 +168,6 @@ struct ReductionHost
             invariant_dim_indexes.clear();

             get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
         };
-
-        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
-        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
     };

     void Run(float alpha,

@@ -181,7 +176,7 @@ struct ReductionHost
              OutDataType* out_data,
              IndexDataType* out_indices)
     {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
         {
             RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
         }

@@ -200,33 +195,34 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check2;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                        ReduceOperation,
+                                                                        AccDataType,
+                                                                        IndexDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal     = ReduceOperation::GetIdentityValue();
             IndexDataType accuIndex = 0;

-            for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
+            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
             {
                 auto offset_reduce =
                     get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                auto currIndex = i;
+                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_nan_check2<AccDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);

@@ -240,13 +236,13 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal     = ReduceOperation::GetIdentityValue();
                 IndexDataType accuIndex = 0;

                 auto offset_invariant =
                     get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

-                for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
+                for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
                 {
                     auto offset_reduce =
                         get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

@@ -254,15 +250,14 @@ struct ReductionHost
                     auto currVal =
                         type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

-                    auto currIndex = i;
+                    auto currIndex = static_cast<IndexDataType>(i);

-                    binop_with_nan_check2<AccDataType, PropagateNan>(
-                        opReduce2, accuVal, currVal, accuIndex, currIndex);
+                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);

@@ -307,15 +302,16 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();

             for(const auto& reduce_index : reduce_dim_indexes)
             {

@@ -324,12 +320,12 @@ struct ReductionHost
                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);

@@ -342,7 +338,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();

                 auto offset_invariant =
                     get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

@@ -355,12 +351,12 @@ struct ReductionHost
                     auto currVal =
                         type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

-                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                    Accumulation::Calculate(accuVal, currVal);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);
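The refactor replaces the enum-driven ReduceOpId/pre-/post-unary std::function plumbing with functor types, so the host reference now mirrors the device kernels: an in-elementwise op per input value, a functor accumulation, and an acc-elementwise op once at the end. A hypothetical, self-contained miniature of that flow (simple float ops standing in for CK's operation types):

#include <cstdio>
#include <vector>

struct Square      { void operator()(float& y, float x) const { y = x * x; } };
struct PassThrough { void operator()(float& y, float x) const { y = x; } };

struct Add
{
    static constexpr float GetIdentityValue() { return 0.0f; }
    void operator()(float& a, float b) const { a = a + b; }
};

template <typename ReduceOperation, typename InOp, typename AccOp>
float reduce_host(const std::vector<float>& in, InOp in_op, AccOp acc_op)
{
    float accu = ReduceOperation::GetIdentityValue();
    for(float v : in)
    {
        in_op(v, v);            // e.g. squaring turns Add into a NORM2-style sum
        ReduceOperation{}(accu, v);
    }
    acc_op(accu, accu);         // final elementwise op (sqrt, divide for AVG, ...)
    return accu;
}

int main()
{
    std::vector<float> data{1, 2, 3};
    std::printf("%f\n", reduce_host<Add>(data, Square{}, PassThrough{})); // 14.0
}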
library/include/ck/library/host_tensor/host_tensor.hpp

@@ -154,7 +154,7 @@ struct ParallelTensorFunctor
 {
     std::array<std::size_t, NDIM> indices;

-    for(int idim = 0; idim < NDIM; ++idim)
+    for(std::size_t idim = 0; idim < NDIM; ++idim)
     {
         indices[idim] = i / mStrides[idim];
         i -= indices[idim] * mStrides[idim];

@@ -316,7 +316,7 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
     constexpr float eps = 1e-10;

-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float ref_v    = ck::type_convert<float>(ref.mData[i]);
         float result_v = ck::type_convert<float>(result.mData[i]);