Commit c98cbea0 authored by Chao Liu

refactoring array type

parent 674c405f
...
@@ -68,17 +68,18 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
     const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor_v2(
         in_n_c_hip_wip_global_desc,
-        make_tuple(DynamicPassThrough{N},
-                   DynamicPassThrough{C},
-                   DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
-                   DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
+        make_tuple(
+            DynamicPassThrough{N},
+            DynamicPassThrough{C},
+            DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
+            DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
         make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

     const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
         in_n_c_y_ho_x_wo_global_desc,
-        make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
-                   DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
+        make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
+                   DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
         make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}));
...
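Note: the new make_multi_index calls only replace the brace-initialized Array literals; the index arithmetic is unchanged. As a reference, here is a standalone sketch (illustrative values, not part of the commit) of what the Embed and Merge transforms above compute, assuming the usual row-major merge order:

// Standalone sketch of the convolution-to-GEMM index arithmetic expressed by the transforms above.
// All sizes and indices are made up for illustration.
#include <cstdio>

int main()
{
    const int Y = 3, X = 3, Ho = 28, Wo = 28;
    const int ConvStrideH = 1, ConvDilationH = 1;

    const int c = 2, y = 1, x = 0, n = 0, ho = 5, wo = 7;

    // DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)}:
    // input H coordinate reached by filter row y at output row ho.
    const int hip = y * ConvDilationH + ho * ConvStrideH;

    // DynamicMerge<3>{make_multi_index(C, Y, X)} folds (c, y, x) into one GEMM-K index;
    // DynamicMerge<3>{make_multi_index(N, Ho, Wo)} folds (n, ho, wo) into GEMM-N.
    const int gemmk = (c * Y + y) * X + x;
    const int gemmn = (n * Ho + ho) * Wo + wo;

    std::printf("hip=%d gemmk=%d gemmn=%d\n", hip, gemmk, gemmn);
    return 0;
}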
...
@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             AddressSpace::Vgpr,
             AddressSpace::Lds,
             InMemoryDataOperation::Set>(
-            MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
+            MultiIndex<4>{0, 0, b_block_data_on_global, 0}, MultiIndex<4>{0, 0, 0, 0});

         // weight tensor
         // global tensor in global memory, src of blockwise copy
...
...
@@ -2,6 +2,7 @@
 #define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP

 #include "common_header.hpp"
+#include "multi_index.hpp"

 namespace ck {
@@ -13,11 +14,11 @@ struct DynamicPassThrough
     const UpperIndex up_lengths_;

     __host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
-        : up_lengths_{{low_length}}
+        : up_lengths_{low_length}
     {
     }

-    __host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{{0}} {}
+    __host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{0} {}

     __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
@@ -73,11 +74,11 @@ struct DynamicLeftPad
     __host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
                                                           const index_t& left_pad)
-        : up_lengths_{{low_length + left_pad}}, left_pad_{left_pad}
+        : up_lengths_{low_length + left_pad}, left_pad_{left_pad}
     {
     }

-    __host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{{0}}, left_pad_{0} {}
+    __host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{0}, left_pad_{0} {}

     __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
@@ -136,12 +137,12 @@ struct DynamicRightPad
     __host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
                                                            const index_t& right_pad)
-        : up_lengths_{{low_length + right_pad}}, low_length_{low_length}, right_pad_{right_pad}
+        : up_lengths_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
     {
     }

     __host__ __device__ explicit constexpr DynamicRightPad()
-        : up_lengths_{{0}}, low_length_{0}, right_pad_{0}
+        : up_lengths_{0}, low_length_{0}, right_pad_{0}
     {
     }
@@ -190,8 +191,7 @@ struct DynamicRightPad
     }
 };

-// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
-//           coefficients[nDimUp]
+// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
 template <index_t NDimUp>
 struct DynamicEmbed
 {
@@ -199,11 +199,10 @@ struct DynamicEmbed
     using UpperIndex = MultiIndex<NDimUp>;

     const UpperIndex up_lengths_;
-    const Array<index_t, NDimUp + 1> coefficients_;
+    const UpperIndex coefficients_;

-    __host__
-    __device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
-                                               const Array<index_t, NDimUp + 1>& coefficients)
+    __host__ __device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
+                                                        const UpperIndex& coefficients)
         : up_lengths_{up_lengths}, coefficients_{coefficients}
     {
         static_assert(UpperIndex::Size() == NDimUp, "wrong! # of dimensions not consistent");
@@ -211,7 +210,7 @@ struct DynamicEmbed
     __host__ __device__ explicit constexpr DynamicEmbed()
         : up_lengths_{make_zero_array<index_t, NDimUp>()},
-          coefficients_{make_zero_array<index_t, NDimUp + 1>()}
+          coefficients_{make_zero_array<index_t, NDimUp>()}
     {
     }
@@ -228,7 +227,7 @@ struct DynamicEmbed
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
                       "wrong! inconsistent # of dimension");

-        idx_low(Number<0>{}) = coefficients_[Number<NDimUp>{}];
+        idx_low(Number<0>{}) = 0;

         static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) {
             idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i];
@@ -288,7 +287,7 @@ struct DynamicMerge
     __host__ __device__ explicit constexpr DynamicMerge()
         : low_lengths_{make_zero_array<index_t, NDimLow>()},
           low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
-          up_lengths_{{0}}
+          up_lengths_{0}
     {
     }
...
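With this change the coefficients array loses its trailing constant-offset entry, and CalculateLowerIndex starts from zero instead of coefficients[NDimUp]. A minimal standalone sketch of the resulting formula (values made up, not from the commit):

// Sketch of DynamicEmbed's lower-index formula after this commit:
//   idx_low = sum_i idx_up[i] * coefficients[i]   (no trailing offset term).
#include <cstdio>

int main()
{
    const int coefficients[2] = {2, 3}; // NDimUp multipliers only
    const int idx_up[2]       = {4, 5};

    int idx_low = 0; // was coefficients[NDimUp] before the refactor
    for(int i = 0; i < 2; ++i)
    {
        idx_low += idx_up[i] * coefficients[i];
    }

    std::printf("idx_low = %d\n", idx_low); // 4*2 + 5*3 = 23
    return 0;
}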
...
@@ -31,9 +31,7 @@ template <index_t N>
 __host__ __device__ constexpr auto
 make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const MultiIndex<N>& strides)
 {
-    const auto coefficients = strides.PushBack(index_t{0});
-
-    const auto transforms = make_tuple(DynamicEmbed<N>{lengths, coefficients});
+    const auto transforms = make_tuple(DynamicEmbed<N>{lengths, strides});

     constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});

     constexpr auto up_dim_hidden_idss =
         make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
@@ -41,11 +39,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const MultiIndex<N>& strides)
     index_t element_space_size = 1;

-#pragma unroll
-    for(index_t i = 0; i < N; ++i)
-    {
-        element_space_size += (lengths[i] - 1) * strides[i];
-    }
+    static_for<0, N, 1>{}([&](auto i) { element_space_size += (lengths[i] - 1) * strides[i]; });

     return DynamicTensorDescriptor_v2<decltype(transforms),
                                       decltype(low_dim_hidden_idss),
...
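The unrolled loop becomes a static_for, but the computed value is the same: the element space is 1 plus the sum over dimensions of (length - 1) * stride. A quick standalone check with made-up lengths and strides:

// Standalone check of the element_space_size formula used above (illustrative values only).
#include <cstdio>

int main()
{
    const int N          = 2;
    const int lengths[2] = {4, 3};
    const int strides[2] = {3, 1};

    int element_space_size = 1;
    for(int i = 0; i < N; ++i)
    {
        element_space_size += (lengths[i] - 1) * strides[i];
    }

    // 1 + (4-1)*3 + (3-1)*1 = 12
    std::printf("element_space_size = %d\n", element_space_size);
    return 0;
}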
#ifndef CK_MULTI_INDEX_HPP
#define CK_MULTI_INDEX_HPP

#include "common_header.hpp"

namespace ck {

#if 1 // debug
template <index_t N>
using MultiIndex = Array<index_t, N>;

template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    return MultiIndex<sizeof...(Xs)>{{static_cast<index_t>(xs)...}};
}
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;

template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    return MultiIndex<sizeof...(Xs)>(static_cast<index_t>(xs)...);
}
#endif

template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
    return unpack([](auto... xs) { return make_multi_index(xs...); },
                  typename uniform_sequence_gen<NSize, 0>::type{});
}

} // namespace ck
#endif
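With MultiIndex currently aliased to Array<index_t, N> (the #if 1 branch), the helpers are thin typed wrappers over brace initialization. A short usage sketch, assuming the composable_kernel headers from this commit are on the include path; it only illustrates the call shapes used by the hunks above:

// Usage sketch only; relies on this repository's headers.
#include "multi_index.hpp"

using namespace ck;

__host__ __device__ void multi_index_examples()
{
    // MultiIndex<3> built from scalars; each argument is cast to index_t.
    const auto idx = make_multi_index(2, 8, 8);

    // All-zero MultiIndex<4>, e.g. for a blockwise-copy thread origin.
    const auto zero = make_zero_multi_index<4>();

    (void)idx;
    (void)zero;
}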
...
@@ -2,24 +2,10 @@
 #define CK_MULTI_INDEX_TRANSFORM_HPP

 #include "common_header.hpp"
+#include "multi_index.hpp"

 namespace ck {

-template <index_t N>
-using MultiIndex = Array<index_t, N>;
-
-template <typename... Xs>
-__host__ __device__ constexpr auto make_multi_index(Xs... xs)
-{
-    return MultiIndex<sizeof...(Xs)>(xs...);
-}
-
-template <index_t NSize>
-__host__ __device__ constexpr auto make_zero_multi_index()
-{
-    return make_zero_array<index_t, NSize>();
-}
-
 template <index_t Length>
 struct PassThrough
 {
...
...
@@ -196,7 +196,7 @@ struct ClusterDescriptor
     __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
     {
-        return mDesc.CalculateLowerIndex(MultiIndex<1>{{idx_1d}});
+        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
    }
};
...
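Only the initialization syntax changes here; CalculateClusterIndex still maps a flat 1-d id to per-dimension cluster coordinates through the descriptor. A standalone conceptual sketch of that decomposition for a hypothetical 4x8 cluster (row-major; the real mapping depends on how mDesc was built):

// Conceptual sketch only: decompose a flat id into 2-d cluster coordinates (hypothetical 4x8 cluster).
#include <cstdio>

int main()
{
    const int lengths[2] = {4, 8};
    const int idx_1d     = 13;

    const int cluster_idx[2] = {idx_1d / lengths[1], idx_1d % lengths[1]}; // {1, 5}

    std::printf("(%d, %d)\n", cluster_idx[0], cluster_idx[1]);
    return 0;
}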
...
@@ -16,48 +16,96 @@ template <typename TData>
 struct StaticallyIndexedArray<TData, 0> : Tuple<>
 {
     using data_type = TData;
+    using base = Tuple<>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 1> : Tuple<TData>
 {
     using data_type = TData;
+    using base = Tuple<TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 2> : Tuple<TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 3> : Tuple<TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 4> : Tuple<TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 5> : Tuple<TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 6> : Tuple<TData, TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
 struct StaticallyIndexedArray<TData, 7> : Tuple<TData, TData, TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
...
@@ -65,6 +113,12 @@ struct StaticallyIndexedArray<TData, 8>
     : Tuple<TData, TData, TData, TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
...
@@ -72,6 +126,12 @@ struct StaticallyIndexedArray<TData, 9>
     : Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
...
@@ -79,6 +139,12 @@ struct StaticallyIndexedArray<TData, 10>
     : Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData, TData>
 {
     using data_type = TData;
+    using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData, TData>;
+
+    template <typename... Ys>
+    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
+    {
+    }
 };

 template <typename TData>
...
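The added forwarding constructors let a StaticallyIndexedArray be built directly from a value list, which is what the #else (StaticallyIndexedArray) branch of make_multi_index in the new multi_index.hpp would rely on. A hypothetical sketch with a simplified std::tuple stand-in, not the project's Tuple:

// Hypothetical, simplified stand-in illustrating the added forwarding constructors;
// not the project's Tuple/StaticallyIndexedArray.
#include <cstdio>
#include <tuple>

template <typename TData, int N>
struct ToyStaticArray;

template <typename TData>
struct ToyStaticArray<TData, 3> : std::tuple<TData, TData, TData>
{
    using data_type = TData;
    using base      = std::tuple<TData, TData, TData>;

    // Same shape as the constructors added in this commit: forward every element to the base tuple.
    template <typename... Ys>
    explicit constexpr ToyStaticArray(Ys&&... ys) : base(ys...)
    {
    }
};

int main()
{
    // Value-list construction, as make_multi_index's StaticallyIndexedArray branch would use.
    const ToyStaticArray<int, 3> a(1, 2, 3);
    std::printf("%d\n", std::get<2>(a)); // 3
    return 0;
}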
...
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
                                                      LeftPads{},
                                                      RightPads{},
                                                      nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
...
...
@@ -10,15 +10,14 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_BUILD_TYPE=Debug \
 -D DEVICE_BACKEND="AMD" \
 -D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH="/opt/rocm" \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 ${MY_PROJECT_SOURCE}

-#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
rm *.ll *.s
/opt/rocm/llvm/bin/llvm-dis driver/conv_driver-hip-amdgcn-amd-amdhsa-gfx906-optimized.bc -o tmp.ll
/opt/rocm/llvm/bin/opt -S -inline -inline-threshold=104857 tmp.ll > inline.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa inline.ll > o3.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3.ll > o3_2.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_2.ll > o3_3.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_3.ll > o3_4.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_2.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_3.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_4.ll