gaoqiong / composable_kernel_ROCM
"research/adversarial_crypto/train_eval.py" did not exist on "76cf35de36b3d5a10255c54b2e54500048c79b47"
Commit 07a673c6, authored Apr 14, 2022 by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: c0f698d5, ac0d8066
Changes: 307 files in the full commit; this page shows 20 changed files with 231 additions and 275 deletions (+231, -275).
include/ck/utility/data_type_enum.hpp (+1, -1)
include/ck/utility/data_type_enum_helper.hpp (+11, -11)
include/ck/utility/dynamic_buffer.hpp (+152, -158)
include/ck/utility/get_id.hpp (+1, -5)
include/ck/utility/multi_index.hpp (+1, -1)
include/ck/utility/reduction_enums.hpp (+4, -4)
include/ck/utility/static_buffer.hpp (+5, -11)
include/ck/utility/synchronization.hpp (+1, -1)
library/include/ck/library/host_tensor/conv_common.hpp (+4, -4)
library/include/ck/library/host_tensor/device_tensor.hpp (+0, -1)
library/include/ck/library/host_tensor/host_reduce_util.hpp (+43, -43)
library/include/ck/library/host_tensor/host_reduction.hpp (+1, -1)
library/include/ck/library/host_tensor/host_tensor.hpp (+0, -27)
library/include/ck/library/obselete_driver_offline/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp (+1, -1)
library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp (+1, -1)
include/ck/utility/data_type_enum.hpp

@@ -3,7 +3,7 @@
 namespace ck {

-enum struct DataTypeEnum_t
+enum struct DataTypeEnum
 {
     Half  = 0,
     Float = 1,
include/ck/utility/data_type_enum_helper.hpp

@@ -6,35 +6,35 @@
 namespace ck {

-template <DataTypeEnum_t DataTypeEnum>
+template <DataTypeEnum DataTypeEnum>
 struct get_datatype_from_enum;

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Int8>
+struct get_datatype_from_enum<DataTypeEnum::Int8>
 {
     using type = int8_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Int32>
+struct get_datatype_from_enum<DataTypeEnum::Int32>
 {
     using type = int32_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Half>
+struct get_datatype_from_enum<DataTypeEnum::Half>
 {
     using type = half_t;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Float>
+struct get_datatype_from_enum<DataTypeEnum::Float>
 {
     using type = float;
 };

 template <>
-struct get_datatype_from_enum<DataTypeEnum_t::Double>
+struct get_datatype_from_enum<DataTypeEnum::Double>
 {
     using type = double;
 };

@@ -45,31 +45,31 @@ struct get_datatype_enum_from_type;
 template <>
 struct get_datatype_enum_from_type<int8_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8;
+    static constexpr DataTypeEnum value = DataTypeEnum::Int8;
 };

 template <>
 struct get_datatype_enum_from_type<int32_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32;
+    static constexpr DataTypeEnum value = DataTypeEnum::Int32;
 };

 template <>
 struct get_datatype_enum_from_type<half_t>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half;
+    static constexpr DataTypeEnum value = DataTypeEnum::Half;
 };

 template <>
 struct get_datatype_enum_from_type<float>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float;
+    static constexpr DataTypeEnum value = DataTypeEnum::Float;
 };

 template <>
 struct get_datatype_enum_from_type<double>
 {
-    static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double;
+    static constexpr DataTypeEnum value = DataTypeEnum::Double;
 };
 } // namespace ck
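These traits map DataTypeEnum values to C++ types and back. As a quick illustration of how they compose (not part of this commit; the helper function below is an assumption built only on the traits shown above):

    // Illustrative sketch only.
    #include <cstddef>
    #include "data_type_enum_helper.hpp"

    template <ck::DataTypeEnum E>
    constexpr std::size_t element_size()
    {
        using T = typename ck::get_datatype_from_enum<E>::type;                      // enum -> type
        static_assert(ck::get_datatype_enum_from_type<T>::value == E, "round trip"); // type -> enum
        return sizeof(T);
    }

    // element_size<ck::DataTypeEnum::Float>() == 4
    // element_size<ck::DataTypeEnum::Half>()  == 2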
include/ck/utility/dynamic_buffer.hpp

-#ifndef CK_BUFFER_HPP
-#define CK_BUFFER_HPP
+#pragma once

 #include "amd_buffer_addressing.hpp"
 #include "c_style_pointer_cast.hpp"
 #include "config.hpp"

@@ -8,7 +6,7 @@
 namespace ck {

-template <AddressSpaceEnum_t BufferAddressSpace,
+template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
           bool InvalidElementUseNumericalZeroValue>

@@ -34,7 +32,7 @@ struct DynamicBuffer
     {
     }

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
     {
         return BufferAddressSpace;
     }

@@ -55,7 +53,7 @@ struct DynamicBuffer
         constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
-                      "wrong! X need to be multiple T");
+                      "wrong! X should contain multiple T");

 #if CK_USE_AMD_BUFFER_LOAD
         bool constexpr use_amd_buffer_addressing = true;

@@ -63,7 +61,7 @@ struct DynamicBuffer
         bool constexpr use_amd_buffer_addressing = false;
 #endif

-        if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing)
+        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
         {
             constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

@@ -81,50 +79,48 @@ struct DynamicBuffer
         }
         else
         {
-            if constexpr(InvalidElementUseNumericalZeroValue)
+            if(is_valid_element)
             {
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                 X tmp;

                 __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));

-                return is_valid_element ? tmp : X{0};
+                return tmp;
 #else
-                return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
+                return *c_style_pointer_cast<const X*>(&p_data_[i]);
 #endif
             }
             else
             {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
-                X tmp;
-
-                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
-
-                return is_valid_element ? tmp : X{invalid_element_value_};
-#else
-                return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
-                                        : X{invalid_element_value_};
-#endif
+                if constexpr(InvalidElementUseNumericalZeroValue)
+                {
+                    return X{0};
+                }
+                else
+                {
+                    return X{invalid_element_value_};
+                }
             }
         }
     }

-    template <InMemoryDataOperationEnum_t Op,
+    template <InMemoryDataOperationEnum Op,
               typename X,
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value,
                                  bool>::type = false>
     __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
     {
-        if constexpr(Op == InMemoryDataOperationEnum_t::Set)
+        if constexpr(Op == InMemoryDataOperationEnum::Set)
         {
             this->template Set<X>(i, is_valid_element, x);
         }
-        else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
+        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
         {
             this->template AtomicAdd<X>(i, is_valid_element, x);
         }
-        else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
+        else if constexpr(Op == InMemoryDataOperationEnum::Add)
         {
             auto tmp = this->template Get<X>(i, is_valid_element);
             this->template Set<X>(i, is_valid_element, x + tmp);

@@ -145,143 +141,120 @@ struct DynamicBuffer
         constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
-                      "wrong! X need to be multiple T");
+                      "wrong! X should contain multiple T");

-        if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
-        {
 #if CK_USE_AMD_BUFFER_STORE
-            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
-
-            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
-                x, p_data_, i, is_valid_element, element_space_size_);
+        bool constexpr use_amd_buffer_addressing = true;
 #else
-            if(is_valid_element)
-            {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
-                X tmp = x;
-
-                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
-#else
-                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
-#endif
-            }
+        bool constexpr use_amd_buffer_addressing = false;
 #endif
-        }
-        else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
+
+#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
+        bool constexpr workaround_int8_ds_write_issue = true;
+#else
+        bool constexpr workaround_int8_ds_write_issue = false;
+#endif
+
+        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
+        {
+            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
+                x, p_data_, i, is_valid_element, element_space_size_);
+        }
+        else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
+                          is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
+                          workaround_int8_ds_write_issue)
         {
             if(is_valid_element)
             {
-#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
-                X tmp = x;
-
-                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
-#else
-                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
-#endif
-#else
-                // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
-                // inefficient
+                // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
                 // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                 // ds_write_b128
                 // TODO: remove this after compiler fix
-                if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value)
-                {
                 static_assert(
                     (is_same<remove_cvref_t<T>, int8_t>::value &&
                      is_same<remove_cvref_t<X>, int8_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8_t>::value &&
                          is_same<remove_cvref_t<X>, int8x2_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8_t>::value &&
                          is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8_t>::value &&
                          is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8_t>::value &&
                          is_same<remove_cvref_t<X>, int8x16_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                          is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8x8_t>::value &&
                          is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                         (is_same<remove_cvref_t<T>, int8x16_t>::value &&
                          is_same<remove_cvref_t<X>, int8x16_t>::value),
                     "wrong! not implemented for this combination, please add "
                     "implementation");

                 if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                              is_same<remove_cvref_t<X>, int8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int8_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x2_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int16_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x4_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
                 else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x4_t*>(&x);
                 }
-                }
-                else
-                {
-#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
-                    X tmp = x;
-
-                    __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
-#else
-                    *c_style_pointer_cast<X*>(&p_data_[i]) = x;
-#endif
-                }
-#endif
             }
         }
         else

@@ -305,27 +278,49 @@ struct DynamicBuffer
                                  bool>::type = false>
     __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
     {
+        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+
         // X contains multiple T
         constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;

         constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
-                      "wrong! X need to be multiple T");
+                      "wrong! X should contain multiple T");

-        static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem");
+        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");

-#if CK_USE_AMD_BUFFER_ATOMIC_ADD
-        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
-
-        amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
-            x, p_data_, i, is_valid_element, element_space_size_);
-#else
-        if(is_valid_element)
-            atomicAdd(&p_data_[i], x);
-#endif
+#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
+        bool constexpr use_amd_buffer_addressing =
+            is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
+            is_same_v<remove_cvref_t<scalar_t>, float> ||
+            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
+#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
+        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
+#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
+        bool constexpr use_amd_buffer_addressing =
+            is_same_v<remove_cvref_t<scalar_t>, float> ||
+            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
+#else
+        bool constexpr use_amd_buffer_addressing = false;
+#endif
+
+        if constexpr(use_amd_buffer_addressing)
+        {
+            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
+                x, p_data_, i, is_valid_element, element_space_size_);
+        }
+        else
+        {
+            if(is_valid_element)
+            {
+                // FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
+                // calling it
+                atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
+            }
+        }
     }

     __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }

@@ -333,14 +328,14 @@ struct DynamicBuffer
     __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
 };

-template <AddressSpaceEnum_t BufferAddressSpace,
+template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize>
 __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
 {
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
 }

-template <AddressSpaceEnum_t BufferAddressSpace,
+template <AddressSpaceEnum BufferAddressSpace,
           typename T,
           typename ElementSpaceSize,
           typename X,

@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
 }
 } // namespace ck
-#endif
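For orientation, the renamed enums appear at DynamicBuffer call sites roughly like this (a minimal sketch built only on the signatures shown above; the kernel-side function and values are illustrative, not part of this commit):

    // Illustrative sketch only.
    __device__ void example(float* p_global, ck::index_t n, ck::index_t i, float x)
    {
        auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_global, n);

        // Op picks the Set / Add / AtomicAdd path at compile time, as in Update() above.
        buf.Update<ck::InMemoryDataOperationEnum::Set, float>(i, /*is_valid_element=*/true, x);

        float y = buf.Get<float>(i, /*is_valid_element=*/true); // buffer-load path on global memory
        (void)y;
    }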
include/ck/utility/utility.hpp → include/ck/utility/get_id.hpp (renamed)

-#ifndef CK_UTILITY_HPP
-#define CK_UTILITY_HPP
+#pragma once

 #include "config.hpp"

 namespace ck {

@@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 __device__ index_t get_grid_size() { return gridDim.x; }

 } // namespace ck
-#endif
include/ck/utility/multi_index.hpp

@@ -3,7 +3,7 @@
 #include "common_header.hpp"

-#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
+#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
 #include "array_multi_index.hpp"
 #else
 #include "statically_indexed_array_multi_index.hpp"
include/ck/utility/reduction_enums.hpp

@@ -28,7 +28,7 @@
 namespace ck {

-enum class ReduceTensorOp_t
+enum struct ReduceTensorOp
 {
     ADD = 0,
     MUL = 1,

@@ -41,19 +41,19 @@ enum class ReduceTensorOp_t
     // MUL_NO_ZEROS = 8,
 };

-enum class NanPropagation_t
+enum struct NanPropagation
 {
     NOT_PROPAGATE_NAN = 0,
     PROPAGATE_NAN     = 1,
 };

-enum class ReduceTensorIndices_t
+enum struct ReduceTensorIndices
 {
     NO_INDICES        = 0,
     FLATTENED_INDICES = 1,
 };

-enum class IndicesType_t
+enum struct IndicesType
 {
     INDICES_32BIT = 0,
     INDICES_64BIT = 1,
include/ck/utility/static_buffer.hpp

@@ -6,7 +6,7 @@
 namespace ck {

 // static buffer for scalar
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename T,
           index_t N,
           bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed

@@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
     __host__ __device__ constexpr StaticBuffer() : base{} {}

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
-    {
-        return AddressSpace;
-    }
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

     __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

@@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
 };

 // static buffer for vector
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename S,
           index_t NumOfVector,
           index_t ScalarPerVector,

@@ -59,10 +56,7 @@ struct StaticBufferTupleOfVector
     __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}

-    __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
-    {
-        return AddressSpace;
-    }
+    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

     __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

@@ -158,7 +152,7 @@ struct StaticBufferTupleOfVector
     }
 };

-template <AddressSpaceEnum_t AddressSpace, typename T, index_t N>
+template <AddressSpaceEnum AddressSpace, typename T, index_t N>
 __host__ __device__ constexpr auto make_static_buffer(Number<N>)
 {
     return StaticBuffer<AddressSpace, T, N, true>{};
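The static-buffer factory follows the same pattern after the rename. A one-line sketch (the address space enumerator, element type, and length below are illustrative assumptions, not part of this commit):

    // Illustrative sketch only: an 8-element, register-backed buffer of float.
    auto thread_buf = ck::make_static_buffer<ck::AddressSpaceEnum::Vgpr, float>(ck::Number<8>{});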
include/ck/utility/synchronization.hpp

@@ -7,7 +7,7 @@ namespace ck {
 __device__ void block_sync_lds()
 {
-#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
+#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
     asm volatile("\
     s_waitcnt lgkmcnt(0) \n \
     s_barrier \
library/include/ck/library/host_tensor/conv_common.hpp

@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
 }

 template <typename T>
-inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
+inline auto activ(T v, const ck::ActivTypeEnum activ_type)
 {
     const T alpha = 0.3;

     switch(activ_type)
     {
-    case ck::ActivTypeEnum_t::None: return v;
-    case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
-    case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
+    case ck::ActivTypeEnum::None: return v;
+    case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
+    case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
     default: throw std::runtime_error("unsupported activ type"); break;
     }
 }
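At call sites the renamed enum reads as follows (a small sketch; the input values are illustrative, and alpha = 0.3 comes from the function above):

    // Illustrative sketch only.
    float leaky = activ(-2.0f, ck::ActivTypeEnum::LeakyRelu); // -2.0f * 0.3f = -0.6f
    float sig   = activ(0.0f, ck::ActivTypeEnum::Sigmoid);    // 1 / (1 + exp(0)) = 0.5f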
library/include/ck/library/host_tensor/device_tensor.hpp

 #pragma once

 #include "host_tensor.hpp"
-#include "common_header.hpp"

 template <typename TensorDesc>
 void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
library/include/ck/library/host_tensor/host_reduce_util.hpp

@@ -39,8 +39,8 @@ namespace ck {
 namespace host_reduce {

-using ck::NanPropagation_t;
-using ck::ReduceTensorOp_t;
+using ck::NanPropagation;
+using ck::ReduceTensorOp;

 template <typename T>
 static inline bool float_equal_one(T);

@@ -66,44 +66,44 @@ static inline bool float_equal_zero(half_float::half x)
     return x == static_cast<half_float::half>(0.0f);
 };

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
     using std::abs;

-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
     {
         return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_) { a_ = a_ * a_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
     else
     {
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
         return ([&](AccDataType&) {});
     };
 };

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
 {
     using std::sqrt;

-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_) { a_ = sqrt(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
     {
         return ([&, divider](AccDataType& a_) {
             a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));

@@ -111,36 +111,36 @@ __host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t di
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
-        // ReduceTensorOp_t::AMAX:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
+        // ReduceTensorOp::AMAX:
         return ([&](AccDataType&) {});
     }
 };

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
-                 ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
+                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
         return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ > b_)
                 a_ = b_;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ < b_)

@@ -149,10 +149,10 @@ __host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn
     }
 };

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ > b_)

@@ -164,7 +164,7 @@ __host__ static inline std::function<void(AccDataType&, AccDataType, bool& chang
             changed = false;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ < b_)

@@ -178,40 +178,40 @@ __host__ static inline std::function<void(AccDataType&, AccDataType, bool& chang
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::NORM2:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::NORM2:
         return (std::function<void(AccDataType&, AccDataType, bool&)>{});
     };
 };

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline AccDataType ReduceOpZeroVal()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
         return (static_cast<AccDataType>(1.0f));
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return (std::numeric_limits<AccDataType>::max());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
     {
         return (std::numeric_limits<AccDataType>::lowest());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {
         return (static_cast<AccDataType>(0.0f));
     }
     else
     {
-        // ReduceTensorOp_t::ADD
-        // ReduceTensorOp_t::AVG
-        // ReduceTensorOp_t::NORM1
-        // ReduceTensorOp_t::NORM2
+        // ReduceTensorOp::ADD
+        // ReduceTensorOp::AVG
+        // ReduceTensorOp::NORM1
+        // ReduceTensorOp::NORM2
         return (static_cast<AccDataType>(0.0f));
     };
 };
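On the host these helpers compose into a reference reduction roughly as follows (a minimal NORM2 sketch using only the functions shown above; the input values are illustrative, not part of this commit):

    // Illustrative sketch only.
    using ck::ReduceTensorOp;
    using namespace ck::host_reduce;

    float acc     = ReduceOpZeroVal<float, ReduceTensorOp::NORM2>(); // 0.0f
    auto preOp    = PreUnaryOpFn<float, ReduceTensorOp::NORM2>(0);   // a_ = a_ * a_
    auto reduceOp = ReduceOpFn<float, ReduceTensorOp::NORM2>();      // a_ = a_ + b_
    auto posOp    = PosUnaryOpFn<float, ReduceTensorOp::NORM2>(4);   // a_ = sqrt(a_)

    float data[4] = {1.0f, 2.0f, 2.0f, 4.0f};
    for(float v : data)
    {
        preOp(v);
        reduceOp(acc, v);
    }
    posOp(acc); // sqrt(1 + 4 + 4 + 16) = 5.0f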
library/include/ck/library/host_tensor/host_reduction.hpp

@@ -104,7 +104,7 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ck::ReduceTensorOp_t ReduceOpId,
+          ck::ReduceTensorOp ReduceOpId,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,
library/include/ck/library/host_tensor/host_tensor.hpp

@@ -300,9 +300,6 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
 void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

-#if 1
 // FIXME: remove
 float bf16_to_f32_(ck::bhalf_t src_val);

 // FIXME: remove
 void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
-#endif

@@ -353,28 +350,4 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
     return linf_error;
 }

-template <typename T>
-void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    bool has_error  = false;
-    int error_count = 0;
-
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        if(ref.mData[i] != result.mData[i])
-        {
-            std::cerr << std::endl
-                      << "Indices different at position " << i << " (ref: " << ref.mData[i]
-                      << ", result: " << result.mData[i] << ")" << std::endl;
-            has_error = true;
-            error_count++;
-            if(error_count == 20)
-                break;
-        };
-    }
-
-    if(!has_error)
-        std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;
-}
 #endif
library/include/ck/library/obselete_driver_offline/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp

@@ -6,7 +6,7 @@
 template <typename TInWei,
           typename TAcc,
           typename TOut,
-          ck::ActivTypeEnum_t activ_type,
+          ck::ActivTypeEnum activ_type,
           typename InLengths,
           typename WeiLengths,
           typename AddLengths,

library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp

@@ -231,7 +231,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(out_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp

@@ -338,7 +338,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

library/include/ck/library/obselete_driver_offline/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp

@@ -307,7 +307,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp

@@ -171,7 +171,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp

@@ -168,7 +168,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

library/include/ck/library/obselete_driver_offline/device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp

@@ -200,7 +200,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         decltype(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),