gaoqiong / composable_kernel · Commits

Commit dd6a8de4, authored Apr 06, 2022 by Jehandad Khan

    Merge branch 'develop' into jd/dev_pkg

Parents: 0aa899aa, abf4bdb9
Changes: 470

Showing 20 changed files with 740 additions and 812 deletions (+740 -812)
include/ck/utility/common_header.hpp                              +9   -12
include/ck/utility/data_type.hpp                                  +3   -6
include/ck/utility/data_type_enum.hpp                             +1   -1
include/ck/utility/data_type_enum_helper.hpp                      +11  -11
include/ck/utility/dynamic_buffer.hpp                             +152 -158
include/ck/utility/get_id.hpp                                     +1   -5
include/ck/utility/multi_index.hpp                                +1   -1
include/ck/utility/reduction_enums.hpp                            +4   -4
include/ck/utility/sequence.hpp                                   +6   -0
include/ck/utility/static_buffer.hpp                              +5   -11
include/ck/utility/synchronization.hpp                            +1   -1
include/ck/utility/tensor_space_filling_curve.hpp                 +13  -0
library/include/ck/library/host_tensor/conv_common.hpp            +4   -4
library/include/ck/library/host_tensor/device.hpp                 +2   -2
library/include/ck/library/host_tensor/device_tensor.hpp          +0   -1
library/include/ck/library/host_tensor/host_generic_reduction.hpp +0   -424
library/include/ck/library/host_tensor/host_reduce_util.hpp       +77  -76
library/include/ck/library/host_tensor/host_reduction.hpp         +402 -0
library/include/ck/library/host_tensor/host_tensor.hpp            +43  -73
library/include/ck/library/host_tensor/host_tensor_generator.hpp  +5   -22
include/ck/utility/common_header.hpp  View file @ dd6a8de4

-#ifndef CK_COMMON_HEADER_HPP
-#define CK_COMMON_HEADER_HPP
+#pragma once

 #include "config.hpp"
 #include "array.hpp"
 #include "container_helper.hpp"
...
@@ -20,30 +18,29 @@
 #include "number.hpp"
 #include "sequence.hpp"
 #include "sequence_helper.hpp"
 #include "synchronization.hpp"
 #include "tuple.hpp"
 #include "tuple_helper.hpp"
 #include "type.hpp"
 #include "magic_division.hpp"
 #include "utility.hpp"
 #include "c_style_pointer_cast.hpp"
 #include "amd_address_space.hpp"
 #include "amd_buffer_addressing.hpp"
 #include "static_buffer.hpp"
 #include "dynamic_buffer.hpp"
 #include "is_known_at_compile_time.hpp"
 #include "transpose_vectors.hpp"
 #include "inner_product.hpp"
 #include "element_wise_operation.hpp"
 #include "debug.hpp"
 #include "amd_buffer_addressing.hpp"
 #include "get_id.hpp"
 #include "synchronization.hpp"
 #include "amd_address_space.hpp"
 #include "static_buffer.hpp"
 #include "dynamic_buffer.hpp"

 // TODO: remove this
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
 #endif

-#if CK_USE_AMD_XDLOPS
+#ifdef CK_USE_AMD_MFMA
 #include "amd_xdlops.hpp"
 #endif
 #endif
include/ck/utility/data_type.hpp  View file @ dd6a8de4

-#ifndef CK_FLOAT_TYPE_AMD_HPP
-#define CK_FLOAT_TYPE_AMD_HPP
+#pragma once

 #include "statically_indexed_array.hpp"

 namespace ck {
...
@@ -937,7 +935,7 @@ __host__ __device__ Y type_convert(X x)
 // convert bfp16 to fp32
 template <>
-inline __host__ __device__ float type_convert(bhalf_t x)
+inline __host__ __device__ float type_convert<float, bhalf_t>(bhalf_t x)
 {
     union
     {
...
@@ -950,7 +948,7 @@ inline __host__ __device__ float type_convert(bhalf_t x)
 // convert fp32 to bfp16
 template <>
-inline __host__ __device__ bhalf_t type_convert(float x)
+inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
 {
     union
     {
...
@@ -1090,4 +1088,3 @@ struct NumericLimits<half_t>
 };

 } // namespace ck
-#endif
include/ck/utility/data_type_enum.hpp  View file @ dd6a8de4
...
@@ -3,7 +3,7 @@
 namespace ck {

-enum DataTypeEnum_t
+enum struct DataTypeEnum
 {
     Half  = 0,
     Float = 1,
...
include/ck/utility/data_type_enum_helper.hpp  View file @ dd6a8de4
...
@@ -6,35 +6,35 @@
 namespace ck {

-template <DataTypeEnum_t DataTypeEnum>
+template <DataTypeEnum DataTypeEnum>
 struct get_datatype_from_enum;

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Int8>
+struct get_datatype_from_enum<DataTypeEnum::Int8>
 {
     using type = int8_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Int32>
+struct get_datatype_from_enum<DataTypeEnum::Int32>
 {
     using type = int32_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Half>
+struct get_datatype_from_enum<DataTypeEnum::Half>
 {
     using type = half_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Float>
+struct get_datatype_from_enum<DataTypeEnum::Float>
 {
     using type = float;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Double>
+struct get_datatype_from_enum<DataTypeEnum::Double>
 {
     using type = double;
 };
...
@@ -45,31 +45,31 @@ struct get_datatype_enum_from_type;
 template <>
 struct get_datatype_enum_from_type<int8_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8;
+    static constexpr DataTypeEnum value = DataTypeEnum::Int8;
 };

 template <>
 struct get_datatype_enum_from_type<int32_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32;
+    static constexpr DataTypeEnum value = DataTypeEnum::Int32;
 };

 template <>
 struct get_datatype_enum_from_type<half_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half;
+    static constexpr DataTypeEnum value = DataTypeEnum::Half;
 };

 template <>
 struct get_datatype_enum_from_type<float>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float;
+    static constexpr DataTypeEnum value = DataTypeEnum::Float;
 };

 template <>
 struct get_datatype_enum_from_type<double>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double;
+    static constexpr DataTypeEnum value = DataTypeEnum::Double;
 };

 } // namespace ck
...
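The rename from DataTypeEnum_t to a scoped DataTypeEnum keeps the mapping pattern intact: a class template declared over the enum, one explicit specialization per enumerator, and a reverse trait mapping a C++ type back to its enumerator. Below is a minimal standalone sketch of that pattern, not the CK headers: half_t and the vector types are omitted, and the enumerator values other than Half and Float are placeholders.

    #include <cstdint>
    #include <type_traits>

    enum struct DataTypeEnum { Half = 0, Float = 1, Int32 = 2, Int8 = 3, Double = 4 }; // values beyond Float are illustrative

    // enum -> type
    template <DataTypeEnum E> struct get_datatype_from_enum;
    template <> struct get_datatype_from_enum<DataTypeEnum::Int8>   { using type = std::int8_t; };
    template <> struct get_datatype_from_enum<DataTypeEnum::Int32>  { using type = std::int32_t; };
    template <> struct get_datatype_from_enum<DataTypeEnum::Float>  { using type = float; };
    template <> struct get_datatype_from_enum<DataTypeEnum::Double> { using type = double; };

    // type -> enum
    template <typename T> struct get_datatype_enum_from_type;
    template <> struct get_datatype_enum_from_type<std::int8_t>  { static constexpr DataTypeEnum value = DataTypeEnum::Int8; };
    template <> struct get_datatype_enum_from_type<std::int32_t> { static constexpr DataTypeEnum value = DataTypeEnum::Int32; };
    template <> struct get_datatype_enum_from_type<float>        { static constexpr DataTypeEnum value = DataTypeEnum::Float; };
    template <> struct get_datatype_enum_from_type<double>       { static constexpr DataTypeEnum value = DataTypeEnum::Double; };

    // the two traits are inverses of each other
    static_assert(std::is_same<get_datatype_from_enum<get_datatype_enum_from_type<float>::value>::type, float>::value, "");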
include/ck/utility/dynamic_buffer.hpp  View file @ dd6a8de4

-#ifndef CK_BUFFER_HPP
-#define CK_BUFFER_HPP
+#pragma once

 #include "amd_buffer_addressing.hpp"
 #include "c_style_pointer_cast.hpp"
 #include "config.hpp"
...
@@ -8,7 +6,7 @@
 namespace ck {

-template <AddressSpaceEnum_t BufferAddressSpace,
+template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
           bool InvalidElementUseNumericalZeroValue>
...
@@ -34,7 +32,7 @@ struct DynamicBuffer
     {
     }

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
     {
         return BufferAddressSpace;
     }
...
@@ -55,7 +53,7 @@ struct DynamicBuffer
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
-                      "wrong! X need to be multiple T");
+                      "wrong! X should contain multiple T");

 #if CK_USE_AMD_BUFFER_LOAD
        bool constexpr use_amd_buffer_addressing = true;
...
@@ -63,7 +61,7 @@ struct DynamicBuffer
        bool constexpr use_amd_buffer_addressing = false;
 #endif

-        if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing)
+        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
...
@@ -81,50 +79,48 @@ struct DynamicBuffer
        }
        else
        {
-            if constexpr(InvalidElementUseNumericalZeroValue)
+            if(is_valid_element)
            {
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp;

                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));

-                return is_valid_element ? tmp : X{0};
+                return tmp;
 #else
-                return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
+                return *c_style_pointer_cast<const X*>(&p_data_[i]);
 #endif
            }
            else
            {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
-                X tmp;
-
-                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
-
-                return is_valid_element ? tmp : X{invalid_element_value_};
-#else
-                return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
-                                        : X{invalid_element_value_};
-#endif
+                if constexpr(InvalidElementUseNumericalZeroValue)
+                {
+                    return X{0};
+                }
+                else
+                {
+                    return X{invalid_element_value_};
+                }
            }
        }
    }

-    template <InMemoryDataOperationEnum_t Op,
+    template <InMemoryDataOperationEnum Op,
              typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
    __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
    {
-        if constexpr(Op == InMemoryDataOperationEnum_t::Set)
+        if constexpr(Op == InMemoryDataOperationEnum::Set)
        {
            this->template Set<X>(i, is_valid_element, x);
        }
-        else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
+        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
        {
            this->template AtomicAdd<X>(i, is_valid_element, x);
        }
-        else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
+        else if constexpr(Op == InMemoryDataOperationEnum::Add)
        {
            auto tmp = this->template Get<X>(i, is_valid_element);
            this->template Set<X>(i, is_valid_element, x + tmp);
...
@@ -145,143 +141,120 @@ struct DynamicBuffer
(the old and new lines of this heavily restructured hunk are shown interleaved, in the order the diff view lists them)

        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X need to be multiple T");
                      "wrong! X should contain multiple T");

        if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
        {
 #if CK_USE_AMD_BUFFER_STORE
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        bool constexpr use_amd_buffer_addressing = true;
 #else
            if(is_valid_element)
            {
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp = x;
        bool constexpr use_amd_buffer_addressing = false;
 #endif
                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
 #if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
        bool constexpr workaround_int8_ds_write_issue = true;
 #else
                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
 #endif
            }
        bool constexpr workaround_int8_ds_write_issue = false;
 #endif

        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
        else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
                          is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
                          workaround_int8_ds_write_issue)
        {
            if(is_valid_element)
            {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp = x;
                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
 #else
                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
 #endif
 #else
                // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
                // inefficient
                // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
                // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                // ds_write_b128
                // TODO: remove this after compiler fix
                if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value)
                static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x2_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x16_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                               is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x16_t>::value &&
                               is_same<remove_cvref_t<X>, int8x16_t>::value),
                              "wrong! not implemented for this combination, please add "
                              "implementation");
                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                             is_same<remove_cvref_t<X>, int8_t>::value)
                {
                static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x2_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x16_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                               is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x8_t>::value &&
                               is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                              (is_same<remove_cvref_t<T>, int8x16_t>::value &&
                               is_same<remove_cvref_t<X>, int8x16_t>::value),
                              "wrong! not implemented for this combination, please add "
                              "implementation");
                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                             is_same<remove_cvref_t<X>, int8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int8_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x2_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int16_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int8_t*>(&x);
                }
                else
                else
                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                             is_same<remove_cvref_t<X>, int8x2_t>::value)
                {
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                    X tmp = x;
                    __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
 #else
                    *c_style_pointer_cast<X*>(&p_data_[i]) = x;
 #endif
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int16_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
 #endif
            }
        }
        else
...
@@ -305,27 +278,49 @@ struct DynamicBuffer
                                 bool>::type = false>
    __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
    {
        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;

        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;

        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X need to be multiple T");
        static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem");
                      "wrong! X should contain multiple T");

        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");

 #if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
            is_same_v<remove_cvref_t<scalar_t>, float> ||
            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
 #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
 #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            is_same_v<remove_cvref_t<scalar_t>, float> ||
            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
 #else
        bool constexpr use_amd_buffer_addressing = false;
 #endif

 #if CK_USE_AMD_BUFFER_ATOMIC_ADD
        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
        if constexpr(use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
 #else
        if(is_valid_element)
            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else
        {
            atomicAdd(&p_data_[i], x);
            if(is_valid_element)
            {
                // FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
                // calling it
                atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
            }
        }
 #endif
    }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
...
@@ -333,14 +328,14 @@ struct DynamicBuffer
    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
 };

-template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
+template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
 __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
 {
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
 }

-template <AddressSpaceEnum_t BufferAddressSpace,
+template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
           typename X,
...
@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
 }

 } // namespace ck
-#endif
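DynamicBuffer::Update() dispatches on the scoped InMemoryDataOperationEnum entirely at compile time, so only the selected branch (Set, Add, or AtomicAdd) is instantiated for a given kernel. A minimal self-contained sketch of that dispatch shape, with illustrative names rather than the CK buffer class, and the atomic branch simplified to a plain add where the real code calls HIP's atomicAdd:

    #include <cstddef>

    enum struct MemOp { Set, Add, AtomicAdd };

    template <MemOp Op, typename T>
    void update(T* data, std::size_t i, bool is_valid, T x)
    {
        if(!is_valid)
            return;

        if constexpr(Op == MemOp::Set)
        {
            data[i] = x; // plain store
        }
        else if constexpr(Op == MemOp::Add)
        {
            data[i] = data[i] + x; // read-modify-write, as Get followed by Set
        }
        else // MemOp::AtomicAdd -- sketched as a plain add; the buffer itself uses atomicAdd
        {
            data[i] = data[i] + x;
        }
    }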
include/ck/utility/utility.hpp → include/ck/utility/get_id.hpp  View file @ dd6a8de4

-#ifndef CK_UTILITY_HPP
-#define CK_UTILITY_HPP
+#pragma once

 #include "config.hpp"

 namespace ck {
...
@@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 __device__ index_t get_grid_size() { return gridDim.x; }

 } // namespace ck
-#endif
include/ck/utility/multi_index.hpp  View file @ dd6a8de4
...
@@ -3,7 +3,7 @@
 #include "common_header.hpp"

-#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
+#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
 #include "array_multi_index.hpp"
 #else
 #include "statically_indexed_array_multi_index.hpp"
...
include/ck/utility/reduction_enums.hpp  View file @ dd6a8de4
...
@@ -28,7 +28,7 @@
 namespace ck {

-enum class ReduceTensorOp_t
+enum struct ReduceTensorOp
 {
     ADD = 0,
     MUL = 1,
...
@@ -41,19 +41,19 @@ enum class ReduceTensorOp_t
     // MUL_NO_ZEROS = 8,
 };

-enum class NanPropagation_t
+enum struct NanPropagation
 {
     NOT_PROPAGATE_NAN = 0,
     PROPAGATE_NAN     = 1,
 };

-enum class ReduceTensorIndices_t
+enum struct ReduceTensorIndices
 {
     NO_INDICES        = 0,
     FLATTENED_INDICES = 1,
 };

-enum class IndicesType_t
+enum struct IndicesType
 {
     INDICES_32BIT = 0,
     INDICES_64BIT = 1,
...
include/ck/utility/sequence.hpp  View file @ dd6a8de4
...
@@ -606,6 +606,12 @@ struct sequence_map_inverse
                                              SeqMap::Size()>::type;
 };

+template <index_t... Xs, index_t... Ys>
+__host__ __device__ constexpr bool operator==(Sequence<Xs...>, Sequence<Ys...>)
+{
+    return ((Xs == Ys) && ...);
+}
+
 template <index_t... Xs, index_t... Ys>
 __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 {
...
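The new operator== on Sequence compares the two compile-time parameter packs element by element with a C++17 fold expression. The same idea in standalone form, using a bare Sequence stand-in rather than the CK type:

    #include <cstddef>

    template <std::size_t... Is>
    struct Sequence {};

    // element-wise comparison of two compile-time sequences of equal length
    template <std::size_t... Xs, std::size_t... Ys>
    constexpr bool operator==(Sequence<Xs...>, Sequence<Ys...>)
    {
        static_assert(sizeof...(Xs) == sizeof...(Ys), "sequences must have the same length");
        return ((Xs == Ys) && ...); // fold over the paired elements
    }

    static_assert(Sequence<1, 2, 3>{} == Sequence<1, 2, 3>{}, "");
    static_assert(!(Sequence<1, 2, 3>{} == Sequence<1, 2, 4>{}), "");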
include/ck/utility/static_buffer.hpp  View file @ dd6a8de4
...
@@ -6,7 +6,7 @@
 namespace ck {

 // static buffer for scalar
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename T,
           index_t N,
           bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed
...
@@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
    __host__ __device__ constexpr StaticBuffer() : base{} {}

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { return AddressSpace; }
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
...
@@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
 };

 // static buffer for vector
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename S,
           index_t NumOfVector,
           index_t ScalarPerVector,
...
@@ -59,10 +56,7 @@ struct StaticBufferTupleOfVector
    __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { return AddressSpace; }
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
...
@@ -158,7 +152,7 @@ struct StaticBufferTupleOfVector
    }
 };

-template <AddressSpaceEnum_t AddressSpace, typename T, index_t N>
+template <AddressSpaceEnum AddressSpace, typename T, index_t N>
 __host__ __device__ constexpr auto make_static_buffer(Number<N>)
 {
     return StaticBuffer<AddressSpace, T, N, true>{};
...
include/ck/utility/synchronization.hpp  View file @ dd6a8de4
...
@@ -7,7 +7,7 @@ namespace ck {
 __device__ void block_sync_lds()
 {
-#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
+#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
     asm volatile("\
     s_waitcnt lgkmcnt(0) \n \
     s_barrier \
...
include/ck/utility/tensor_space_filling_curve.hpp  View file @ dd6a8de4
...
@@ -37,6 +37,10 @@ struct SpaceFillingCurve
    __host__ __device__ static constexpr index_t GetNumOfAccess()
    {
+        static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
+        static_assert(
+            TensorLengths{} % ScalarsPerAccess{} ==
+            typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});
        return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
               ScalarPerVector;
    }
...
@@ -140,6 +144,15 @@ struct SpaceFillingCurve
        }();

        return idx_md;
    }
+
+    // FIXME: rename this function
+    template <index_t AccessIdx1d>
+    static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
+    {
+        constexpr auto idx = GetIndex(Number<AccessIdx1d>{});
+
+        return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
+    }
 };

 } // namespace ck
...
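GetNumOfAccess() is just the product of the tensor lengths divided by the vector width, with the new static_asserts guaranteeing that the division is exact in every dimension. The same arithmetic in a plain scalar sketch, outside the Sequence machinery:

    #include <array>
    #include <cstddef>

    // number of vectorized accesses needed to cover a dense tensor
    template <std::size_t NDim>
    constexpr std::size_t num_of_access(const std::array<std::size_t, NDim>& lengths,
                                        std::size_t scalar_per_vector)
    {
        std::size_t n = 1;
        for(std::size_t d = 0; d < NDim; ++d)
            n *= lengths[d];
        // the header's static_assert guarantees scalar_per_vector divides n exactly
        return n / scalar_per_vector;
    }

    static_assert(num_of_access<2>({4, 8}, 4) == 8, "a 4x8 tile read 4 scalars at a time needs 8 accesses");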
library/include/ck/library/host_tensor/conv_common.hpp  View file @ dd6a8de4
...
@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
 }

 template <typename T>
-inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
+inline auto activ(T v, const ck::ActivTypeEnum activ_type)
 {
     const T alpha = 0.3;

     switch(activ_type)
     {
-    case ck::ActivTypeEnum_t::None: return v;
-    case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
-    case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
+    case ck::ActivTypeEnum::None: return v;
+    case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
+    case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
     default: throw std::runtime_error("unsupported activ type"); break;
     }
 }
...
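activ() maps the enum to the usual element-wise formulas: identity, leaky ReLU with the hard-coded slope 0.3, and the logistic sigmoid 1/(1+exp(-v)). The same reference functions in standalone form (plain C++, a local enum rather than ck::ActivTypeEnum):

    #include <cmath>
    #include <stdexcept>

    enum struct Activ { None, LeakyRelu, Sigmoid };

    inline double activ_ref(double v, Activ type)
    {
        const double alpha = 0.3; // same slope the header hard-codes
        switch(type)
        {
        case Activ::None: return v;
        case Activ::LeakyRelu: return v >= 0 ? v : alpha * v;
        case Activ::Sigmoid: return 1.0 / (1.0 + std::exp(-v));
        default: throw std::runtime_error("unsupported activ type");
        }
    }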
library/include/ck/library/host_tensor/device.hpp  View file @ dd6a8de4
...
@@ -48,8 +48,10 @@ struct DeviceMem
     DeviceMem() = delete;
     DeviceMem(std::size_t mem_size);
     void* GetDeviceBuffer();
+    std::size_t GetBufferSize();
     void ToDevice(const void* p);
     void FromDevice(void* p);
+    void SetZero();
     ~DeviceMem();

     void* mpDeviceBuf;
...
@@ -109,8 +111,6 @@ float launch_and_time_kernel(
     timer.End();

     // std::this_thread::sleep_for (std::chrono::microseconds(10));

     return timer.GetElapsedTime() / nrepeat;
 #else
     std::ignore = nrepeat;
...
library/include/ck/library/host_tensor/device_tensor.hpp  View file @ dd6a8de4

 #pragma once

 #include "host_tensor.hpp"
 #include "common_header.hpp"

 template <typename TensorDesc>
 void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
...
library/include/ck/library/host_tensor/host_generic_reduction.hpp  deleted 100644 → 0  View file @ 0aa899aa
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;

namespace ck {
namespace host_reduce {

template <typename T>
static void get_all_indexes(const std::vector<T>& dimLengths,
                            int dim,
                            std::vector<std::vector<T>>& indexes)
{
    if(dim < dimLengths.size())
    {
        std::vector<std::vector<T>> updated_indexes;

        if(dim == 0)
        {
            assert(indexes.size() == 0);
            assert(dimLengths[dim] > 0);
            for(T i = 0; i < dimLengths[dim]; i++)
            {
                std::vector<T> index = {i};

                updated_indexes.push_back(index);
            };
        }
        else
        {
            // go through all the current indexes
            for(const auto& index : indexes)
                for(T i = 0; i < dimLengths[dim]; i++)
                {
                    auto index_new = index;
                    index_new.push_back(i);

                    updated_indexes.push_back(index_new);
                };
        };

        // update to the indexes (output)
        indexes = updated_indexes;

        // further to construct the indexes from the updated status
        get_all_indexes(dimLengths, dim + 1, indexes);
    };
};

template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
    T offset = 0;

    assert(strides.size() == index.size());

    for(int i = 0; i < index.size(); i++)
        offset += strides[i] * static_cast<T>(index[i]);

    return (offset);
};

template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
    T offset = 0;

    assert(lengths.size() == index.size() && lengths.size() > 0);

    int len  = lengths.size();
    T stride = 1;

    // for len==1, the loop is not executed
    for(int i = len - 1; i > 0; i--)
    {
        offset += stride * static_cast<T>(index[i]);

        stride *= lengths[i];
    };

    offset += stride * static_cast<T>(index[0]);

    return (offset);
};

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          ck::ReduceTensorOp_t ReduceOpId,
          bool PropagateNan,
          bool NeedIndices>
class ReductionHost
{
    public:
    ReductionHost() = default;

    ReductionHost(HostTensorDescriptor& inDesc,
                  HostTensorDescriptor& outDesc,
                  const std::vector<int>& invariantDims_,
                  const std::vector<int>& toReduceDims_)
    {
        this->inLengths  = to_int_vector(inDesc.GetLengths());
        this->outLengths = to_int_vector(outDesc.GetLengths());
        this->inStrides  = to_int_vector(inDesc.GetStrides());
        this->outStrides = to_int_vector(outDesc.GetStrides());

        this->invariantDims = invariantDims_;
        this->toReduceDims  = toReduceDims_;

        assert(this->inLengths.size() == this->outLengths.size());
        assert(!this->toReduceDims.empty());

        for(const auto dim : this->invariantDims)
            this->invariantLengths.push_back(this->inLengths[dim]);

        for(const auto dim : this->toReduceDims)
            toReduceLengths.push_back(this->inLengths[dim]);

        this->reduceAllDims = this->invariantDims.empty();
    };

    ~ReductionHost(){};

    void Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
    {
        if constexpr(NeedIndices)
            RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
        else
            RunImpl_no_indices(alpha, in_data, beta, out_data);
    };

    private:
    std::vector<int> inLengths;
    std::vector<int> outLengths;
    std::vector<int> inStrides;
    std::vector<int> outStrides;

    std::vector<int> invariantLengths;
    std::vector<int> toReduceLengths;

    std::vector<int> invariantDims;
    std::vector<int> toReduceDims;

    bool reduceAllDims;

    void RunImpl_with_indices(
        float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
    {
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::PosUnaryOpFn;
        using ck::host_reduce::PreUnaryOpFn;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();

        int divider = 1;
        for(int i = 0; i < toReduceLengths.size(); i++)
            divider *= toReduceLengths[i];

        auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
        auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);

        if(reduceAllDims)
        {
            std::vector<std::vector<int>> indexes_1;

            get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space

            auto accuVal  = ReduceOpZeroVal<AccDataType, ReduceOpId>();
            int accuIndex = 0;

            // go through indexes of the invariant dimensions
            for(const auto& src_index : indexes_1)
            {
                auto src_offset = get_offset_from_index(this->inStrides, src_index);

                auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                // unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
                // done
                PreUnaryOp(currVal);

                auto currIndex = get_flatten_offset(inLengths, src_index);

                binop_with_nan_check2<AccDataType, PropagateNan>(
                    opReduce, accuVal, currVal, accuIndex, currIndex);
            };

            // scale the accumulated value
            if(!float_equal_one(alpha))
                accuVal *= static_cast<AccDataType>(alpha);

            // scale the prior dst value and add it to the accumulated value
            if(!float_equal_zero(beta))
                accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);

            // store the reduced value to dst location
            out_data[0] = static_cast<OutDataType>(accuVal);
            indices[0]  = accuIndex;
        }
        else
        {
            std::vector<std::vector<int>> indexes_1, indexes_2;

            get_all_indexes(this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
            get_all_indexes(this->toReduceLengths, 0, indexes_2);  // generate the toReduce indexes space

            // go through indexes of the invariant dimensions
            for(const auto& index_1 : indexes_1)
            {
                std::vector<int> src_index;
                std::vector<int> dst_index;

                src_index.resize(this->inLengths.size());

                // generate the part of src index belonging to invariant dims
                for(int k = 0; k < invariantDims.size(); k++)
                    src_index[invariantDims[k]] = index_1[k];

                for(int k = 0; k < invariantDims.size(); k++)
                    dst_index.push_back(index_1[k]);

                int dst_offset = get_offset_from_index(this->outStrides, dst_index);

                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
                int accuIndex       = 0;

                // go through indexes of the toReduce dimensions
                for(const auto& index_2 : indexes_2)
                {
                    // generate the part of src index belonging to toReduce dims
                    for(int k = 0; k < toReduceDims.size(); k++)
                        src_index[toReduceDims[k]] = index_2[k];

                    auto src_offset = get_offset_from_index(this->inStrides, src_index);

                    auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                    // unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
                    // actually done
                    PreUnaryOp(currVal);

                    auto currIndex = get_flatten_offset(toReduceLengths, index_2);

                    binop_with_nan_check2<AccDataType, PropagateNan>(
                        opReduce, accuVal, currVal, accuIndex, currIndex);
                };

                // scale the accumulated value
                if(!float_equal_one(alpha))
                    accuVal *= static_cast<AccDataType>(alpha);

                // scale the prior dst value and add it to the accumulated value
                if(!float_equal_zero(beta))
                    accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
                               static_cast<AccDataType>(beta);

                // store the reduced value to dst location
                out_data[dst_offset] = static_cast<OutDataType>(accuVal);
                indices[dst_offset]  = accuIndex;
            };
        };
    }; // end of RunImpl_with_indices()

    void RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::PosUnaryOpFn;
        using ck::host_reduce::PreUnaryOpFn;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();

        int divider = 1;
        for(int i = 0; i < toReduceLengths.size(); i++)
            divider *= toReduceLengths[i];

        auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
        auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);

        if(reduceAllDims)
        {
            std::vector<std::vector<int>> indexes_1;

            get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space

            auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

            // go through indexes of the invariant dimensions
            for(const auto& src_index : indexes_1)
            {
                auto src_offset = get_offset_from_index(this->inStrides, src_index);

                auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                PreUnaryOp(currVal);

                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
            };

            PosUnaryOp(accuVal);

            // scale the accumulated value
            if(!float_equal_one(alpha))
                accuVal *= static_cast<AccDataType>(alpha);

            // scale the prior dst value and add it to the accumulated value
            if(!float_equal_zero(beta))
                accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);

            // store the reduced value to dst location
            out_data[0] = static_cast<OutDataType>(accuVal);
        }
        else
        {
            std::vector<std::vector<int>> indexes_1, indexes_2;

            get_all_indexes(this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
            get_all_indexes(this->toReduceLengths, 0, indexes_2);  // generate the toReduce indexes space

            // go through indexes of the invariant dimensions
            for(const auto& index_1 : indexes_1)
            {
                std::vector<int> src_index;
                std::vector<int> dst_index;

                src_index.resize(this->inLengths.size());

                for(int k = 0; k < invariantDims.size(); k++)
                    dst_index.push_back(index_1[k]);

                int dst_offset = get_offset_from_index(this->outStrides, dst_index);

                // generate the part of src index belonging to invariant dims
                for(int k = 0; k < invariantDims.size(); k++)
                    src_index[invariantDims[k]] = index_1[k];

                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

                // go through indexes of the toReduce dimensions
                for(const auto& index_2 : indexes_2)
                {
                    // generate the part of src index belonging to toReduce dims
                    for(int k = 0; k < toReduceDims.size(); k++)
                        src_index[toReduceDims[k]] = index_2[k];

                    auto src_offset = get_offset_from_index(this->inStrides, src_index);

                    auto currVal = static_cast<AccDataType>(in_data[src_offset]);

                    PreUnaryOp(currVal);

                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
                };

                PosUnaryOp(accuVal);

                // scale the accumulated value
                if(!float_equal_one(alpha))
                    accuVal *= static_cast<AccDataType>(alpha);

                // scale the prior dst value and add it to the accumulated value
                if(!float_equal_zero(beta))
                    accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
                               static_cast<AccDataType>(beta);

                // store the reduced value to dst location
                out_data[dst_offset] = static_cast<OutDataType>(accuVal);
            };
        };
    }; // end of RunImpl_no_indices()
};

}; // end of namespace host_reduce
}; // end of namespace ck

#endif
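The deleted host reference works by materializing the full Cartesian index space of the reduced dimensions and turning each multi-index into a linear offset through the strides. A compact standalone version of those two helpers, written iteratively rather than recursively (std::vector based, illustrative names, not the CK API):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // all multi-indexes of a dense index space with the given dimension lengths
    inline std::vector<std::vector<int>> all_indexes(const std::vector<int>& lengths)
    {
        std::vector<std::vector<int>> result{{}};
        for(int len : lengths)
        {
            std::vector<std::vector<int>> next;
            for(const auto& prefix : result)
                for(int i = 0; i < len; ++i)
                {
                    auto idx = prefix;
                    idx.push_back(i);
                    next.push_back(idx);
                }
            result = std::move(next);
        }
        return result;
    }

    // linear offset of a multi-index under the given strides
    inline std::size_t offset_of(const std::vector<int>& strides, const std::vector<int>& index)
    {
        std::size_t offset = 0;
        for(std::size_t i = 0; i < index.size(); ++i)
            offset += static_cast<std::size_t>(strides[i]) * static_cast<std::size_t>(index[i]);
        return offset;
    }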
library/include/ck/library/host_tensor/host_reduce_util.hpp  View file @ dd6a8de4
...
@@ -39,8 +39,8 @@ namespace ck {
 namespace host_reduce {

-using ck::NanPropagation_t;
-using ck::ReduceTensorOp_t;
+using ck::NanPropagation;
+using ck::ReduceTensorOp;

 template <typename T>
 static inline bool float_equal_one(T);
...
@@ -66,95 +66,95 @@ static inline bool float_equal_zero(half_float::half x)
     return x == static_cast<half_float::half>(0.0f);
 };

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
     using std::abs;

-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
     {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
-        return ([&](compType& a_) { a_ = a_ * a_; });
+        return ([&](AccDataType& a_) { a_ = a_ * a_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
     else
     {
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
-        return ([&](compType&) {});
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
+        return ([&](AccDataType&) {});
     };
 };

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
 {
     using std::sqrt;

-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
-        return ([&](compType& a_) { a_ = sqrt(a_); });
+        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
     {
-        return ([&, divider](compType& a_) {
-            a_ = a_ / static_cast<compType>(static_cast<float>(divider));
+        return ([&, divider](AccDataType& a_) {
+            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
         });
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
-        // ReduceTensorOp_t::AMAX:
-        return ([&](compType&) {});
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
+        // ReduceTensorOp::AMAX:
+        return ([&](AccDataType&) {});
     }
 };

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
-                 ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
+                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
     {
-        return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
-        return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ > b_)
                 a_ = b_;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ < b_)
                 a_ = b_;
         });
     }
 };

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ > b_)
             {
                 a_ = b_;
...
@@ -164,9 +164,9 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
                 changed = false;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ < b_)
             {
                 a_ = b_;
...
@@ -178,48 +178,49 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::NORM2:
-        return (std::function<void(compType&, compType, bool&)>{});
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::NORM2:
+        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
     };
 };

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline compType ReduceOpZeroVal()
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
+__host__ static inline AccDataType ReduceOpZeroVal()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
-        return (static_cast<compType>(1.0f));
+        return (static_cast<AccDataType>(1.0f));
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
-        return (std::numeric_limits<compType>::max());
+        return (std::numeric_limits<AccDataType>::max());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
     {
-        return (std::numeric_limits<compType>::lowest());
+        return (std::numeric_limits<AccDataType>::lowest());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {
-        return (static_cast<compType>(0.0f));
+        return (static_cast<AccDataType>(0.0f));
     }
     else
     {
-        // ReduceTensorOp_t::ADD
-        // ReduceTensorOp_t::AVG
-        // ReduceTensorOp_t::NORM1
-        // ReduceTensorOp_t::NORM2
-        return (static_cast<compType>(0.0f));
+        // ReduceTensorOp::ADD
+        // ReduceTensorOp::AVG
+        // ReduceTensorOp::NORM1
+        // ReduceTensorOp::NORM2
+        return (static_cast<AccDataType>(0.0f));
     };
 };

-template <typename compType, bool PropagateNan>
-__host__ static inline void
-binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
-                     compType& accuVal,
-                     compType currVal)
+template <typename AccDataType, bool PropagateNan>
+__host__ static inline void
+binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
+                     AccDataType& accuVal,
+                     AccDataType currVal)
 {
     using std::isnan;
...
@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
     };
 };

-template <typename compType, bool PropagateNan>
+template <typename AccDataType, bool PropagateNan>
 __host__ static inline void
-binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
-                      compType& accuVal,
-                      compType currVal,
+binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
+                      AccDataType& accuVal,
+                      AccDataType currVal,
                       int& accuIndex,
                       int currIndex)
 {
...
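Each host reduction in these utilities is described by three pieces: a neutral start value (ReduceOpZeroVal), a binary combine step with optional NaN handling (binop_with_nan_check), and pre/post unary ops (abs for NORM1/AMAX, square then sqrt for NORM2, divide-by-count for AVG). A minimal standalone sketch of that decomposition for a max-reduction with NaN propagation, using hypothetical names rather than the CK helpers:

    #include <cmath>
    #include <limits>

    // neutral element for a max-reduction
    template <typename T>
    constexpr T max_identity() { return std::numeric_limits<T>::lowest(); }

    // combine step with optional NaN propagation, mirroring the binop_with_nan_check idea
    template <typename T, bool PropagateNan>
    void max_combine(T& accu, T curr)
    {
        if constexpr(PropagateNan)
        {
            if(std::isnan(curr))
            {
                accu = curr; // a single NaN poisons the result
                return;
            }
        }
        if(accu < curr)
            accu = curr;
    }

    template <typename T, bool PropagateNan>
    T reduce_max(const T* data, int n)
    {
        T accu = max_identity<T>();
        for(int i = 0; i < n; ++i)
            max_combine<T, PropagateNan>(accu, data[i]);
        return accu;
    }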
library/include/ck/library/host_tensor/host_reduction.hpp  0 → 100644  View file @ dd6a8de4
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template
<
int
NDim
>
static
void
get_all_indexes
(
const
std
::
array
<
size_t
,
NDim
>&
dimLengths
,
std
::
vector
<
std
::
array
<
size_t
,
NDim
>>&
indexes
)
{
static_assert
(
NDim
>=
1
,
"NDim >= 1 is required to use this function!"
);
if
constexpr
(
NDim
==
1
)
{
for
(
size_t
i
=
0
;
i
<
dimLengths
[
0
];
i
++
)
{
std
::
array
<
size_t
,
1
>
index
{
i
};
indexes
.
push_back
(
index
);
};
}
else
{
std
::
array
<
size_t
,
NDim
-
1
>
partial_dim_lengths
;
for
(
int
i
=
0
;
i
<
NDim
-
1
;
i
++
)
partial_dim_lengths
[
i
]
=
dimLengths
[
i
+
1
];
std
::
vector
<
std
::
array
<
size_t
,
NDim
-
1
>>
partial_indexes
;
get_all_indexes
<
NDim
-
1
>
(
partial_dim_lengths
,
partial_indexes
);
for
(
size_t
i
=
0
;
i
<
dimLengths
[
0
];
i
++
)
for
(
const
auto
&
index
:
partial_indexes
)
{
std
::
array
<
size_t
,
NDim
>
extIndex
;
extIndex
[
0
]
=
i
;
for
(
int
k
=
0
;
k
<
NDim
-
1
;
k
++
)
extIndex
[
k
+
1
]
=
index
[
k
];
indexes
.
push_back
(
extIndex
);
};
};
};
template
<
int
NDim
>
static
size_t
get_offset_from_index
(
const
std
::
array
<
size_t
,
NDim
>&
strides
,
const
std
::
array
<
size_t
,
NDim
>&
index
)
{
size_t
offset
=
0
;
for
(
int
i
=
0
;
i
<
NDim
;
i
++
)
offset
+=
strides
[
i
]
*
index
[
i
];
return
(
offset
);
};
template
<
int
NDim
>
static
size_t
get_offset_from_index
(
const
std
::
vector
<
size_t
>&
strides
,
const
std
::
array
<
size_t
,
NDim
>&
index
)
{
size_t
offset
=
0
;
for
(
int
i
=
0
;
i
<
NDim
;
i
++
)
offset
+=
strides
[
i
]
*
index
[
i
];
return
(
offset
);
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
ck
::
ReduceTensorOp
ReduceOpId
,
int
Rank
,
int
NumReduceDim
,
bool
PropagateNan
,
bool
NeedIndices
>
struct
ReductionHost
{
using
IndexDataType
=
int32_t
;
static
constexpr
int
NumInvariantDim
=
Rank
-
NumReduceDim
;
std
::
vector
<
size_t
>
outStrides
;
std
::
vector
<
int
>
invariantDims
;
std
::
vector
<
int
>
reduceDims
;
IndexDataType
divider
;
std
::
function
<
void
(
AccDataType
&
)
>
preUnaryOp
;
std
::
function
<
void
(
AccDataType
&
)
>
posUnaryOp
;
std
::
array
<
size_t
,
NumReduceDim
>
reduceLengths
;
std
::
array
<
size_t
,
NumReduceDim
>
reduceStrides
;
std
::
array
<
size_t
,
NumInvariantDim
>
invariantLengths
;
std
::
array
<
size_t
,
NumInvariantDim
>
invariantStrides
;
std
::
vector
<
std
::
array
<
size_t
,
NumReduceDim
>>
reduce_dim_indexes
;
std
::
vector
<
std
::
array
<
size_t
,
NumInvariantDim
>>
invariant_dim_indexes
;
ReductionHost
(
HostTensorDescriptor
&
inDesc
,
HostTensorDescriptor
&
outDesc
,
const
std
::
vector
<
int
>&
invariantDims_
,
const
std
::
vector
<
int
>&
reduceDims_
)
{
using
ck
::
host_reduce
::
PosUnaryOpFn
;
using
ck
::
host_reduce
::
PreUnaryOpFn
;
// this->outLengths = to_int_vector(outDesc.GetLengths());
this
->
outStrides
=
outDesc
.
GetStrides
();
this
->
invariantDims
=
invariantDims_
;
this
->
reduceDims
=
reduceDims_
;
int
product
=
1
;
for
(
int
i
=
0
;
i
<
NumReduceDim
;
i
++
)
{
reduceLengths
[
i
]
=
inDesc
.
GetLengths
()[
reduceDims
[
i
]];
reduceStrides
[
i
]
=
inDesc
.
GetStrides
()[
reduceDims
[
i
]];
product
*=
inDesc
.
GetLengths
()[
reduceDims
[
i
]];
};
divider
=
product
;
for
(
int
i
=
0
;
i
<
NumInvariantDim
;
i
++
)
{
invariantLengths
[
i
]
=
inDesc
.
GetLengths
()[
invariantDims
[
i
]];
invariantStrides
[
i
]
=
inDesc
.
GetStrides
()[
invariantDims
[
i
]];
};
reduce_dim_indexes
.
clear
();
get_all_indexes
<
NumReduceDim
>
(
reduceLengths
,
reduce_dim_indexes
);
if
constexpr
(
NumInvariantDim
>
0
)
{
invariant_dim_indexes
.
clear
();
get_all_indexes
<
NumInvariantDim
>
(
invariantLengths
,
invariant_dim_indexes
);
};
preUnaryOp
=
PreUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
posUnaryOp
=
PosUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
};
    void Run(float alpha,
             const InDataType* in_data,
             float beta,
             OutDataType* out_data,
             IndexDataType* out_indices)
    {
        if constexpr(NeedIndices)
        {
            RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
        }
        else
        {
            RunImpl_no_index(alpha, in_data, beta, out_data);
        };
    };
    void RunImpl_with_index(float alpha,
                            const InDataType* in_data,
                            float beta,
                            OutDataType* out_data,
                            IndexDataType* out_indices)
    {
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check2;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
            IndexDataType accuIndex = 0;

            for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                preUnaryOp(currVal);

                auto currIndex = i;

                binop_with_nan_check2<AccDataType, PropagateNan>(
                    opReduce2, accuVal, currVal, accuIndex, currIndex);
            };

            posUnaryOp(accuVal);

            if(!float_equal_one(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0]    = type_convert<OutDataType>(accuVal);
            out_indices[0] = accuIndex;
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    preUnaryOp(currVal);

                    auto currIndex = i;

                    binop_with_nan_check2<AccDataType, PropagateNan>(
                        opReduce2, accuVal, currVal, accuIndex, currIndex);
                };

                posUnaryOp(accuVal);

                if(!float_equal_one(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset]    = type_convert<OutDataType>(accuVal);
                out_indices[dst_offset] = accuIndex;
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
    void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check;
        using ck::host_reduce::float_equal_one;
        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;

        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                preUnaryOp(currVal);

                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
            };

            posUnaryOp(accuVal);

            if(!float_equal_one(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0] = type_convert<OutDataType>(accuVal);
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(const auto& reduce_index : reduce_dim_indexes)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    preUnaryOp(currVal);

                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
                };

                posUnaryOp(accuVal);

                if(!float_equal_one(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
};
#endif
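The structure of RunImpl_with_index above is a per-invariant-index reduction lambda plus a (N + T - 1) / T chunked work split over threads. Below is a standalone sketch of the same pattern (illustration only, not CK code): it uses plain std::thread and join instead of CK's joinable_thread wrapper, and the 4x8 shape, max-reduction, and lambda names are example choices.

// illustration only: reduce each row of a 4x8 tensor to its max value and the winning index
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

int main()
{
    const std::size_t rows = 4, cols = 8; // invariant dim = rows, reduce dim = cols
    std::vector<float> in(rows * cols);
    for(std::size_t i = 0; i < in.size(); i++)
        in[i] = static_cast<float>((i * 7) % 13); // arbitrary test pattern

    std::vector<float> out(rows);
    std::vector<int> out_idx(rows);

    // per-invariant-index reduction that also tracks the winning reduce-dim index
    auto row_reduce = [&](std::size_t r) {
        float best = in[r * cols];
        int best_i = 0;
        for(std::size_t c = 1; c < cols; c++)
            if(in[r * cols + c] > best)
            {
                best   = in[r * cols + c];
                best_i = static_cast<int>(c);
            }
        out[r]     = best;
        out_idx[r] = best_i;
    };

    // chunked partitioning of the invariant indexes, same (N + T - 1) / T scheme as above
    const std::size_t num_thread      = 2;
    const std::size_t work_per_thread = (rows + num_thread - 1) / num_thread;

    std::vector<std::thread> threads;
    for(std::size_t t = 0; t < num_thread; t++)
    {
        std::size_t begin = t * work_per_thread;
        std::size_t end   = std::min((t + 1) * work_per_thread, rows);
        threads.emplace_back([=] {
            for(std::size_t r = begin; r < end; r++)
                row_reduce(r);
        });
    }
    for(auto& th : threads)
        th.join();

    for(std::size_t r = 0; r < rows; r++)
        std::cout << "row " << r << ": max " << out[r] << " at col " << out_idx[r] << "\n";
}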
library/include/ck/library/host_tensor/host_tensor.hpp
View file @
dd6a8de4
...
...
@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
    return os;
}

typedef enum
{
    Half  = 0,
    Float = 1,
} DataType_t;

template <typename T>
struct DataType;

template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};

template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
...
...
@@ -87,10 +73,10 @@ struct HostTensorDescriptor
    HostTensorDescriptor() = delete;

    template <typename X>
    HostTensorDescriptor(std::vector<X> lens);
    HostTensorDescriptor(const std::vector<X>& lens);

    template <typename X, typename Y>
    HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
    HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides);

    void CalculateStrides();
...
...
@@ -177,7 +163,7 @@ struct ParallelTensorFunctor
        return indices;
    }

    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
...
...
@@ -227,7 +213,7 @@ struct Tensor
    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
...
...
@@ -299,85 +285,69 @@ struct Tensor
};

template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens) : mLens(lens)
{
    this->CalculateStrides();
}

template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides)
    : mLens(lens), mStrides(strides)
{
}

void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

float bf16_to_f32_(ck::bhalf_t src_val);

#if 1
// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
#endif

template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    float error     = 0;
    float max_diff  = -1;
    float ref_value = 0, result_value = 0;
    float l1_error       = 0;
    float linf_error     = -1;
    float linf_rel_error = -1;
    float linf_ref_value = 0, linf_result_value = 0;
    float linf_rel_ref_value = 0, linf_rel_result_value = 0;

    constexpr float eps = 1e-10;

    if constexpr(std::is_same<ck::bhalf_t, T>::value)
    for(int i = 0; i < ref.mData.size(); ++i)
    {
        for(int i = 0; i < ref.mData.size(); ++i)
        float ref_v    = ck::type_convert<float>(ref.mData[i]);
        float result_v = ck::type_convert<float>(result.mData[i]);

        float diff     = std::abs(ref_v - result_v);
        float rel_diff = diff / std::max(std::abs(ref_v), eps);

        l1_error += diff;

        if(linf_error < diff)
        {
            error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
            float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
            if(max_diff < diff)
            {
                max_diff     = diff;
                ref_value    = bf16_to_f32_(ref.mData[i]);
                result_value = bf16_to_f32_(result.mData[i]);
            }
            linf_error        = diff;
            linf_ref_value    = ref_v;
            linf_result_value = result_v;
        }
    }
    else
    {
        for(int i = 0; i < ref.mData.size(); ++i)
        if(linf_rel_error < rel_diff)
        {
            error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
            float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
            if(max_diff < diff)
            {
                max_diff     = diff;
                ref_value    = ref.mData[i];
                result_value = result.mData[i];
            }
            linf_rel_error        = rel_diff;
            linf_rel_ref_value    = ref_v;
            linf_rel_result_value = result_v;
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}

template <typename T>
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
    bool has_error  = false;
    int error_count = 0;

    for(int i = 0; i < ref.mData.size(); ++i)
    {
        if(ref.mData[i] != result.mData[i])
        {
            std::cerr << std::endl
                      << "Indices different at position " << i << " (ref: " << ref.mData[i]
                      << ", result: " << result.mData[i] << ")" << std::endl;
            has_error = true;
            error_count++;
            if(error_count == 20)
                break;
        };
    }

    std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
    std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
              << linf_ref_value << ", result " << linf_result_value << std::endl;
    std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
              << linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;

    if(!has_error)
        std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;

    return linf_error;
}
#endif
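The updated check_error reports three metrics: the L1 norm of the absolute error, the L-inf norm of the absolute error, and the L-inf norm of the relative error with an epsilon guard against division by zero. A standalone sketch of those three computations follows (illustration only; the input vectors are made-up example data, and the eps value mirrors the 1e-10 used above).

// illustration only: L1, L-inf, and relative L-inf error between two float vectors
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<float> ref{1.0f, 2.0f, 0.0f, 4.0f};
    const std::vector<float> res{1.1f, 2.0f, 0.1f, 3.5f};

    constexpr float eps = 1e-10f;

    float l1_error = 0, linf_error = -1, linf_rel_error = -1;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const float diff     = std::abs(ref[i] - res[i]);
        const float rel_diff = diff / std::max(std::abs(ref[i]), eps);

        l1_error += diff;                                     // sum of absolute differences
        linf_error     = std::max(linf_error, diff);          // largest absolute difference
        linf_rel_error = std::max(linf_rel_error, rel_diff);  // largest relative difference
    }

    std::cout << "L1: " << l1_error << ", L-inf: " << linf_error
              << ", relative L-inf: " << linf_rel_error << "\n";
}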
library/include/ck/library/host_tensor/host_tensor_generator.hpp
View file @
dd6a8de4
#ifndef HOST_TENSOR_GENERATOR_HPP
#define HOST_TENSOR_GENERATOR_HPP
#pragma once
#include <cmath>
#include <numeric>
#include "config.hpp"
template <typename T>
...
...
@@ -93,8 +94,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T>
struct GeneratorTensor_3
{
    T min_value = 0;
    T max_value = 1;
    float min_value = 0;
    float max_value = 1;

    template <typename... Is>
    T operator()(Is...)
...
...
@@ -122,22 +123,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
    }
};

template <>
struct GeneratorTensor_3<int8_t>
{
    float min_value = 0;
    float max_value = 1;

    template <typename... Is>
    int8_t operator()(Is...)
    {
        int8_t min_tmp = static_cast<int8_t>(min_value);
        int8_t max_tmp = static_cast<int8_t>(max_value);

        return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
    }
};

struct GeneratorTensor_Checkboard
{
    template <typename... Ts>
...
...
@@ -163,5 +148,3 @@ struct GeneratorTensor_Sequential
        return dims[Dim];
    }
};
#endif
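This hunk changes GeneratorTensor_3 to keep its min_value/max_value range as float regardless of the element type, so the int8_t specialization becomes unnecessary. A standalone sketch of that style of generator follows (illustration only: UniformGenerator is a hypothetical name, not a CK type, and the scaling of std::rand() onto a float range is one plausible way to realize the pattern).

// illustration only: a float-ranged random generator whose result is cast to T
#include <cstdlib>
#include <iostream>

template <typename T>
struct UniformGenerator
{
    float min_value = 0;
    float max_value = 1;

    T operator()() const
    {
        // uniform float in [min_value, max_value), then narrowed to the element type
        const float u = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
        return static_cast<T>(min_value + u * (max_value - min_value));
    }
};

int main()
{
    UniformGenerator<float> gen{-1.0f, 1.0f};

    for(int i = 0; i < 5; ++i)
        std::cout << gen() << " ";
    std::cout << "\n";
}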