Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c98cbea0
Commit
c98cbea0
authored
Sep 26, 2020
by
Chao Liu
Browse files
refactoring array type
parent
674c405f
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
146 additions
and
51 deletions
+146
-51
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
...l/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+7
-6
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+1
-1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+14
-15
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
...ensor_description/dynamic_tensor_descriptor_helper_v2.hpp
+2
-8
composable_kernel/include/tensor_description/multi_index.hpp
composable_kernel/include/tensor_description/multi_index.hpp
+37
-0
composable_kernel/include/tensor_description/multi_index_transform.hpp
...rnel/include/tensor_description/multi_index_transform.hpp
+1
-15
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
...l/include/tensor_description/tensor_descriptor_helper.hpp
+1
-1
composable_kernel/include/utility/statically_indexed_array.hpp
...sable_kernel/include/utility/statically_indexed_array.hpp
+66
-0
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+1
-1
script/cmake-rocm3.7.sh
script/cmake-rocm3.7.sh
+3
-4
script/hipclang_opt.sh
script/hipclang_opt.sh
+13
-0
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
View file @
c98cbea0
...
@@ -68,17 +68,18 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
...
@@ -68,17 +68,18 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_hip_wip_global_desc
,
in_n_c_hip_wip_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
make_tuple
(
DynamicPassThrough
{
C
},
DynamicPassThrough
{
N
},
DynamicEmbed
<
2
>
{{
Y
,
Ho
},
{
ConvDilationH
,
ConvStrideH
,
0
}},
DynamicPassThrough
{
C
},
DynamicEmbed
<
2
>
{{
X
,
Wo
},
{
ConvDilationW
,
ConvStrideW
,
0
}}),
DynamicEmbed
<
2
>
{
make_multi_index
(
Y
,
Ho
),
make_multi_index
(
ConvDilationH
,
ConvStrideH
)},
DynamicEmbed
<
2
>
{
make_multi_index
(
X
,
Wo
),
make_multi_index
(
ConvDilationW
,
ConvStrideW
)}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{
M
ulti
I
ndex
<
3
>
{{
C
,
Y
,
X
}}
},
make_tuple
(
DynamicMerge
<
3
>
{
make_m
ulti
_i
ndex
(
C
,
Y
,
X
)
},
DynamicMerge
<
3
>
{
M
ulti
I
ndex
<
3
>
{{
N
,
Ho
,
Wo
}}
}),
DynamicMerge
<
3
>
{
make_m
ulti
_i
ndex
(
N
,
Ho
,
Wo
)
}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
c98cbea0
...
@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace
::
Vgpr
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Lds
,
AddressSpace
::
Lds
,
InMemoryDataOperation
::
Set
>
(
InMemoryDataOperation
::
Set
>
(
MultiIndex
<
4
>
{
{
0
,
0
,
b_block_data_on_global
,
0
}
}
,
MultiIndex
<
4
>
{
{
0
,
0
,
0
,
0
}
}
);
MultiIndex
<
4
>
{
0
,
0
,
b_block_data_on_global
,
0
},
MultiIndex
<
4
>
{
0
,
0
,
0
,
0
});
// weight tensor
// weight tensor
// global tensor in global memory, src of blockwise copy
// global tensor in global memory, src of blockwise copy
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
c98cbea0
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "multi_index.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -13,11 +14,11 @@ struct DynamicPassThrough
...
@@ -13,11 +14,11 @@ struct DynamicPassThrough
const
UpperIndex
up_lengths_
;
const
UpperIndex
up_lengths_
;
__host__
__device__
explicit
constexpr
DynamicPassThrough
(
const
index_t
&
low_length
)
__host__
__device__
explicit
constexpr
DynamicPassThrough
(
const
index_t
&
low_length
)
:
up_lengths_
{
{
low_length
}
}
:
up_lengths_
{
low_length
}
{
{
}
}
__host__
__device__
explicit
constexpr
DynamicPassThrough
()
:
up_lengths_
{
{
0
}
}
{}
__host__
__device__
explicit
constexpr
DynamicPassThrough
()
:
up_lengths_
{
0
}
{}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
...
@@ -73,11 +74,11 @@ struct DynamicLeftPad
...
@@ -73,11 +74,11 @@ struct DynamicLeftPad
__host__
__device__
explicit
constexpr
DynamicLeftPad
(
const
index_t
&
low_length
,
__host__
__device__
explicit
constexpr
DynamicLeftPad
(
const
index_t
&
low_length
,
const
index_t
&
left_pad
)
const
index_t
&
left_pad
)
:
up_lengths_
{
{
low_length
+
left_pad
}
}
,
left_pad_
{
left_pad
}
:
up_lengths_
{
low_length
+
left_pad
},
left_pad_
{
left_pad
}
{
{
}
}
__host__
__device__
explicit
constexpr
DynamicLeftPad
()
:
up_lengths_
{
{
0
}
},
left_pad_
{
0
}
{}
__host__
__device__
explicit
constexpr
DynamicLeftPad
()
:
up_lengths_
{
0
},
left_pad_
{
0
}
{}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
...
@@ -136,12 +137,12 @@ struct DynamicRightPad
...
@@ -136,12 +137,12 @@ struct DynamicRightPad
__host__
__device__
explicit
constexpr
DynamicRightPad
(
const
index_t
&
low_length
,
__host__
__device__
explicit
constexpr
DynamicRightPad
(
const
index_t
&
low_length
,
const
index_t
&
right_pad
)
const
index_t
&
right_pad
)
:
up_lengths_
{
{
low_length
+
right_pad
}
}
,
low_length_
{
low_length
},
right_pad_
{
right_pad
}
:
up_lengths_
{
low_length
+
right_pad
},
low_length_
{
low_length
},
right_pad_
{
right_pad
}
{
{
}
}
__host__
__device__
explicit
constexpr
DynamicRightPad
()
__host__
__device__
explicit
constexpr
DynamicRightPad
()
:
up_lengths_
{
{
0
}
},
low_length_
{
0
},
right_pad_
{
0
}
:
up_lengths_
{
0
},
low_length_
{
0
},
right_pad_
{
0
}
{
{
}
}
...
@@ -190,8 +191,7 @@ struct DynamicRightPad
...
@@ -190,8 +191,7 @@ struct DynamicRightPad
}
}
};
};
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
// coefficients[nDimUp]
template
<
index_t
NDimUp
>
template
<
index_t
NDimUp
>
struct
DynamicEmbed
struct
DynamicEmbed
{
{
...
@@ -199,11 +199,10 @@ struct DynamicEmbed
...
@@ -199,11 +199,10 @@ struct DynamicEmbed
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
using
UpperIndex
=
MultiIndex
<
NDimUp
>
;
const
UpperIndex
up_lengths_
;
const
UpperIndex
up_lengths_
;
const
Array
<
index_t
,
NDimUp
+
1
>
coefficients_
;
const
UpperIndex
coefficients_
;
__host__
__host__
__device__
explicit
constexpr
DynamicEmbed
(
const
UpperIndex
&
up_lengths
,
__device__
explicit
constexpr
DynamicEmbed
(
const
UpperIndex
&
up_lengths
,
const
UpperIndex
&
coefficients
)
const
Array
<
index_t
,
NDimUp
+
1
>&
coefficients
)
:
up_lengths_
{
up_lengths
},
coefficients_
{
coefficients
}
:
up_lengths_
{
up_lengths
},
coefficients_
{
coefficients
}
{
{
static_assert
(
UpperIndex
::
Size
()
==
NDimUp
,
"wrong! # of dimensions not consistent"
);
static_assert
(
UpperIndex
::
Size
()
==
NDimUp
,
"wrong! # of dimensions not consistent"
);
...
@@ -211,7 +210,7 @@ struct DynamicEmbed
...
@@ -211,7 +210,7 @@ struct DynamicEmbed
__host__
__device__
explicit
constexpr
DynamicEmbed
()
__host__
__device__
explicit
constexpr
DynamicEmbed
()
:
up_lengths_
{
make_zero_array
<
index_t
,
NDimUp
>
()},
:
up_lengths_
{
make_zero_array
<
index_t
,
NDimUp
>
()},
coefficients_
{
make_zero_array
<
index_t
,
NDimUp
+
1
>
()}
coefficients_
{
make_zero_array
<
index_t
,
NDimUp
>
()}
{
{
}
}
...
@@ -228,7 +227,7 @@ struct DynamicEmbed
...
@@ -228,7 +227,7 @@ struct DynamicEmbed
static_assert
(
LowIdx
::
Size
()
==
1
&&
UpIdx
::
Size
()
==
NDimUp
,
static_assert
(
LowIdx
::
Size
()
==
1
&&
UpIdx
::
Size
()
==
NDimUp
,
"wrong! inconsistent # of dimension"
);
"wrong! inconsistent # of dimension"
);
idx_low
(
Number
<
0
>
{})
=
coefficients_
[
Number
<
NDimUp
>
{}]
;
idx_low
(
Number
<
0
>
{})
=
0
;
static_for
<
0
,
NDimUp
,
1
>
{}([
&
idx_low
,
&
idx_up
,
this
](
auto
i
)
{
static_for
<
0
,
NDimUp
,
1
>
{}([
&
idx_low
,
&
idx_up
,
this
](
auto
i
)
{
idx_low
(
Number
<
0
>
{})
+=
idx_up
[
i
]
*
this
->
coefficients_
[
i
];
idx_low
(
Number
<
0
>
{})
+=
idx_up
[
i
]
*
this
->
coefficients_
[
i
];
...
@@ -288,7 +287,7 @@ struct DynamicMerge
...
@@ -288,7 +287,7 @@ struct DynamicMerge
__host__
__device__
explicit
constexpr
DynamicMerge
()
__host__
__device__
explicit
constexpr
DynamicMerge
()
:
low_lengths_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
:
low_lengths_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
low_lengths_scan_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
low_lengths_scan_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
up_lengths_
{
{
0
}
}
up_lengths_
{
0
}
{
{
}
}
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
View file @
c98cbea0
...
@@ -31,9 +31,7 @@ template <index_t N>
...
@@ -31,9 +31,7 @@ template <index_t N>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
make_dynamic_native_tensor_descriptor_v2
(
const
MultiIndex
<
N
>&
lengths
,
const
MultiIndex
<
N
>&
strides
)
make_dynamic_native_tensor_descriptor_v2
(
const
MultiIndex
<
N
>&
lengths
,
const
MultiIndex
<
N
>&
strides
)
{
{
const
auto
coefficients
=
strides
.
PushBack
(
index_t
{
0
});
const
auto
transforms
=
make_tuple
(
DynamicEmbed
<
N
>
{
lengths
,
strides
});
const
auto
transforms
=
make_tuple
(
DynamicEmbed
<
N
>
{
lengths
,
coefficients
});
constexpr
auto
low_dim_hidden_idss
=
make_tuple
(
Sequence
<
0
>
{});
constexpr
auto
low_dim_hidden_idss
=
make_tuple
(
Sequence
<
0
>
{});
constexpr
auto
up_dim_hidden_idss
=
constexpr
auto
up_dim_hidden_idss
=
make_tuple
(
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{});
make_tuple
(
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{});
...
@@ -41,11 +39,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul
...
@@ -41,11 +39,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul
index_t
element_space_size
=
1
;
index_t
element_space_size
=
1
;
#pragma unroll
static_for
<
0
,
N
,
1
>
{}([
&
](
auto
i
)
{
element_space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
});
for
(
index_t
i
=
0
;
i
<
N
;
++
i
)
{
element_space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
DynamicTensorDescriptor_v2
<
decltype
(
transforms
),
return
DynamicTensorDescriptor_v2
<
decltype
(
transforms
),
decltype
(
low_dim_hidden_idss
),
decltype
(
low_dim_hidden_idss
),
...
...
composable_kernel/include/tensor_description/multi_index.hpp
0 → 100644
View file @
c98cbea0
#ifndef CK_MULTI_INDEX_HPP
#define CK_MULTI_INDEX_HPP
#include "common_header.hpp"
namespace
ck
{
// Compile-time switch for the container behind MultiIndex<N>.
// The active branch aliases Array<index_t, N>; the disabled branch aliases
// StaticallyIndexedArray<index_t, N>. Both branches provide make_multi_index()
// with the same signature so callers do not depend on the chosen backend.
#if 1 // debug
// N-dimensional index backed by Array<index_t, N>.
template <index_t N>
using MultiIndex = Array<index_t, N>;

// Builds a MultiIndex whose dimension equals the number of arguments.
// Each argument is explicitly converted to index_t before being stored.
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    return MultiIndex<sizeof...(Xs)>{{static_cast<index_t>(xs)...}};
}
#else
// N-dimensional index backed by StaticallyIndexedArray<index_t, N>.
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;

// Builds a MultiIndex whose dimension equals the number of arguments.
// Each argument is explicitly converted to index_t and forwarded to the
// StaticallyIndexedArray constructor (parenthesized call, not brace-init).
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    // A stray token after the pack expansion made this branch ill-formed
    // whenever the switch above was flipped; it has been removed.
    return MultiIndex<sizeof...(Xs)>(static_cast<index_t>(xs)...);
}
#endif
// Returns an NSize-dimensional multi-index built by make_multi_index from the
// elements of uniform_sequence_gen<NSize, 0>::type, expanded through unpack().
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
    // Sequence type whose elements are handed to the lambda below by unpack().
    using zero_sequence = typename uniform_sequence_gen<NSize, 0>::type;

    return unpack([](auto... zeros) { return make_multi_index(zeros...); }, zero_sequence{});
}
}
// namespace ck
#endif
composable_kernel/include/tensor_description/multi_index_transform.hpp
View file @
c98cbea0
...
@@ -2,24 +2,10 @@
...
@@ -2,24 +2,10 @@
#define CK_MULTI_INDEX_TRANSFORM_HPP
#define CK_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
#include "common_header.hpp"
#include "multi_index.hpp"
namespace
ck
{
namespace
ck
{
template
<
index_t
N
>
using
MultiIndex
=
Array
<
index_t
,
N
>
;
template
<
typename
...
Xs
>
__host__
__device__
constexpr
auto
make_multi_index
(
Xs
...
xs
)
{
return
MultiIndex
<
sizeof
...(
Xs
)
>
(
xs
...);
}
template
<
index_t
NSize
>
__host__
__device__
constexpr
auto
make_zero_multi_index
()
{
return
make_zero_array
<
index_t
,
NSize
>
();
}
template
<
index_t
Length
>
template
<
index_t
Length
>
struct
PassThrough
struct
PassThrough
{
{
...
...
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
View file @
c98cbea0
...
@@ -196,7 +196,7 @@ struct ClusterDescriptor
...
@@ -196,7 +196,7 @@ struct ClusterDescriptor
__host__
__device__
static
constexpr
auto
CalculateClusterIndex
(
index_t
idx_1d
)
__host__
__device__
static
constexpr
auto
CalculateClusterIndex
(
index_t
idx_1d
)
{
{
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
{
idx_1d
}
}
);
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
idx_1d
});
}
}
};
};
...
...
composable_kernel/include/utility/statically_indexed_array.hpp
View file @
c98cbea0
...
@@ -16,48 +16,96 @@ template <typename TData>
...
@@ -16,48 +16,96 @@ template <typename TData>
struct
StaticallyIndexedArray
<
TData
,
0
>
:
Tuple
<>
struct
StaticallyIndexedArray
<
TData
,
0
>
:
Tuple
<>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
1
>
:
Tuple
<
TData
>
struct
StaticallyIndexedArray
<
TData
,
1
>
:
Tuple
<
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
2
>
:
Tuple
<
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
2
>
:
Tuple
<
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
3
>
:
Tuple
<
TData
,
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
3
>
:
Tuple
<
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
4
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
4
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
5
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
5
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
6
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
6
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
struct
StaticallyIndexedArray
<
TData
,
7
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
struct
StaticallyIndexedArray
<
TData
,
7
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
...
@@ -65,6 +113,12 @@ struct StaticallyIndexedArray<TData, 8>
...
@@ -65,6 +113,12 @@ struct StaticallyIndexedArray<TData, 8>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
...
@@ -72,6 +126,12 @@ struct StaticallyIndexedArray<TData, 9>
...
@@ -72,6 +126,12 @@ struct StaticallyIndexedArray<TData, 9>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
...
@@ -79,6 +139,12 @@ struct StaticallyIndexedArray<TData, 10>
...
@@ -79,6 +139,12 @@ struct StaticallyIndexedArray<TData, 10>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
:
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
{
{
using
data_type
=
TData
;
using
data_type
=
TData
;
using
base
=
Tuple
<
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
,
TData
>
;
template
<
typename
...
Ys
>
__host__
__device__
explicit
constexpr
StaticallyIndexedArray
(
Ys
&&
...
ys
)
:
base
(
ys
...)
{
}
};
};
template
<
typename
TData
>
template
<
typename
TData
>
...
...
driver/src/conv_driver.cpp
View file @
c98cbea0
...
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
...
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
LeftPads{},
RightPads{},
RightPads{},
nrepeat);
nrepeat);
#elif
1
#elif
0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
...
script/cmake-rocm3.7.sh
View file @
c98cbea0
...
@@ -10,15 +10,14 @@ cmake
...
@@ -10,15 +10,14 @@ cmake
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Debug
\
-D
CMAKE_BUILD_TYPE
=
Debug
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=
$CWD
"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=
$CWD
"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
${
MY_PROJECT_SOURCE
}
${
MY_PROJECT_SOURCE
}
#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
script/hipclang_opt.sh
0 → 100755
View file @
c98cbea0
#!/bin/bash
# Re-runs LLVM optimization passes on the device bitcode saved by the build
# (-save-temps) and lowers each intermediate result with llc.
#
# Steps:
#   1. disassemble the saved bitcode to tmp.ll
#   2. run the inliner with a very large threshold     -> inline.ll
#   3. run "-O3 -sroa" four times in a row             -> o3.ll, o3_2.ll, o3_3.ll, o3_4.ll
#   4. run llc on each of the four optimized modules
#
# Optional environment overrides (defaults match the original hard-coded values):
#   LLVM_BIN  directory holding llvm-dis / opt / llc
#   MCPU      value passed to llc -mcpu
#   INPUT_BC  bitcode file to start from

LLVM_BIN="${LLVM_BIN:-/opt/rocm/llvm/bin}"
# NOTE(review): the default input bitcode is named for gfx906 while llc targets
# gfx908 -- confirm whether this mismatch is intentional.
MCPU="${MCPU:-gfx908}"
INPUT_BC="${INPUT_BC:-driver/conv_driver-hip-amdgcn-amd-amdhsa-gfx906-optimized.bc}"

# -f: do not fail or complain when there are no leftovers from a previous run.
rm -f *.ll *.s

"${LLVM_BIN}/llvm-dis" "${INPUT_BC}" -o tmp.ll
"${LLVM_BIN}/opt" -S -inline -inline-threshold=104857 tmp.ll > inline.ll

# Each round consumes the output of the previous one.
prev=inline.ll
for out in o3.ll o3_2.ll o3_3.ll o3_4.ll; do
    "${LLVM_BIN}/opt" -S -O3 -sroa "${prev}" > "${out}"
    prev="${out}"
done

# Lower every optimized module so the results of each round can be compared.
for module in o3.ll o3_2.ll o3_3.ll o3_4.ll; do
    "${LLVM_BIN}/llc" -mcpu="${MCPU}" "${module}"
done
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment