Commit b134b7d6 authored by carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 090ba885 9f71ff48
...@@ -211,7 +211,8 @@ struct ReductionHost ...@@ -211,7 +211,8 @@ struct ReductionHost
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>(); AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0; IndexDataType accuIndex = 0;
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) for(IndexDataType i = 0; i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
i++)
{ {
auto offset_reduce = auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]); get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
...@@ -246,7 +247,9 @@ struct ReductionHost ...@@ -246,7 +247,9 @@ struct ReductionHost
auto offset_invariant = auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index); get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) for(IndexDataType i = 0;
i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
i++)
{ {
auto offset_reduce = auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]); get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
......
...@@ -154,7 +154,7 @@ struct ParallelTensorFunctor ...@@ -154,7 +154,7 @@ struct ParallelTensorFunctor
{ {
std::array<std::size_t, NDIM> indices; std::array<std::size_t, NDIM> indices;
for(int idim = 0; idim < NDIM; ++idim) for(std::size_t idim = 0; idim < NDIM; ++idim)
{ {
indices[idim] = i / mStrides[idim]; indices[idim] = i / mStrides[idim];
i -= indices[idim] * mStrides[idim]; i -= indices[idim] * mStrides[idim];
...@@ -316,7 +316,7 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result) ...@@ -316,7 +316,7 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
constexpr float eps = 1e-10; constexpr float eps = 1e-10;
for(int i = 0; i < ref.mData.size(); ++i) for(std::size_t i = 0; i < ref.mData.size(); ++i)
{ {
float ref_v = ck::type_convert<float>(ref.mData[i]); float ref_v = ck::type_convert<float>(ref.mData[i]);
float result_v = ck::type_convert<float>(result.mData[i]); float result_v = ck::type_convert<float>(result.mData[i]);
......
...@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator ...@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -70,18 +70,25 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -70,18 +70,25 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
auto f_kcyx = [&](auto k, auto c, auto y, auto x) { auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
float v_acc = 0; float v_acc = 0;
for(int n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n) for(std::size_t n = 0; n < arg.out_n_k_ho_wo_.mDesc.GetLengths()[0]; ++n)
{ {
for(int ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho) for(std::size_t ho = 0; ho < arg.out_n_k_ho_wo_.mDesc.GetLengths()[2]; ++ho)
{ {
int hi = ho * arg.conv_strides_[I0] + y * arg.conv_dilations_[I0] - auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[I0]) +
arg.in_left_pads_[I0]; ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[I0]) -
for(int wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I0]);
for(std::size_t wo = 0; wo < arg.out_n_k_ho_wo_.mDesc.GetLengths()[3]; ++wo)
{ {
int wi = wo * arg.conv_strides_[I1] + x * arg.conv_dilations_[I1] - auto wi =
arg.in_left_pads_[I1]; ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[I1]) +
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[I1]) -
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[I1]);
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
float v_out; float v_out;
float v_in; float v_in;
...@@ -114,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator ...@@ -114,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -78,15 +78,18 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -78,15 +78,18 @@ struct ReferenceConvBwdData : public device::BaseOperator
AccDataType v_acc = 0; AccDataType v_acc = 0;
for(int x = 0; x < X; ++x) for(std::size_t x = 0; x < X; ++x)
{ {
int w_tmp = wi + arg.in_left_pads_[0] - x * arg.conv_dilations_[0]; auto w_tmp = ck::type_convert<ck::long_index_t>(wi) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]);
if(w_tmp % arg.conv_strides_[0] == 0) if(w_tmp % arg.conv_strides_[0] == 0)
{ {
int wo = w_tmp / arg.conv_strides_[0]; auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
if(wo >= 0 && wo < Wo) ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
{ {
for(int k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; AccDataType v_out = 0;
AccDataType v_wei = 0; AccDataType v_wei = 0;
...@@ -128,24 +131,32 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -128,24 +131,32 @@ struct ReferenceConvBwdData : public device::BaseOperator
AccDataType v_acc = 0; AccDataType v_acc = 0;
for(int y = 0; y < Y; ++y) for(std::size_t y = 0; y < Y; ++y)
{ {
int h_tmp = hi + arg.in_left_pads_[0] - y * arg.conv_dilations_[0]; auto h_tmp = ck::type_convert<ck::long_index_t>(hi) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]);
if(h_tmp % arg.conv_strides_[0] == 0) if(h_tmp % arg.conv_strides_[0] == 0)
{ {
int ho = h_tmp / arg.conv_strides_[0]; auto ho = ck::type_convert<ck::long_index_t>(h_tmp) /
if(ho >= 0 && ho < Ho) ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
{ {
for(int x = 0; x < X; ++x) for(std::size_t x = 0; x < X; ++x)
{ {
int w_tmp = auto w_tmp =
wi + arg.in_left_pads_[1] - x * arg.conv_dilations_[1]; ck::type_convert<ck::long_index_t>(wi) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]) -
ck::type_convert<ck::long_index_t>(x *
arg.conv_dilations_[1]);
if(w_tmp % arg.conv_strides_[1] == 0) if(w_tmp % arg.conv_strides_[1] == 0)
{ {
int wo = w_tmp / arg.conv_strides_[1]; auto wo = ck::type_convert<ck::long_index_t>(w_tmp) /
if(wo >= 0 && wo < Wo) ck::type_convert<ck::long_index_t>(
arg.conv_strides_[1]);
if(wo >= 0 && ck::type_convert<std::size_t>(wo) < Wo)
{ {
for(int k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; AccDataType v_out = 0;
AccDataType v_wei = 0; AccDataType v_wei = 0;
...@@ -194,33 +205,49 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -194,33 +205,49 @@ struct ReferenceConvBwdData : public device::BaseOperator
AccDataType v_acc = 0; AccDataType v_acc = 0;
for(int z = 0; z < Z; ++z) for(std::size_t z = 0; z < Z; ++z)
{ {
int d_tmp = di + arg.in_left_pads_[0] - z * arg.conv_dilations_[0]; auto d_tmp = ck::type_convert<ck::long_index_t>(di) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]) -
ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]);
if(d_tmp % arg.conv_strides_[0] == 0) if(d_tmp % arg.conv_strides_[0] == 0)
{ {
int do_ = d_tmp / arg.conv_strides_[0]; auto do_ = ck::type_convert<ck::long_index_t>(d_tmp) /
if(do_ >= 0 && do_ < Do) ck::type_convert<ck::long_index_t>(arg.conv_strides_[0]);
if(do_ >= 0 && ck::type_convert<std::size_t>(do_) < Do)
{ {
for(int y = 0; y < Y; ++y) for(std::size_t y = 0; y < Y; ++y)
{ {
int h_tmp = auto h_tmp =
hi + arg.in_left_pads_[1] - y * arg.conv_dilations_[1]; ck::type_convert<ck::long_index_t>(hi) +
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]) -
ck::type_convert<ck::long_index_t>(y *
arg.conv_dilations_[1]);
if(h_tmp % arg.conv_strides_[1] == 0) if(h_tmp % arg.conv_strides_[1] == 0)
{ {
int ho = h_tmp / arg.conv_strides_[1]; auto ho = ck::type_convert<ck::long_index_t>(h_tmp) /
if(ho >= 0 && ho < Ho) ck::type_convert<ck::long_index_t>(
arg.conv_strides_[1]);
if(ho >= 0 && ck::type_convert<std::size_t>(ho) < Ho)
{ {
for(int x = 0; x < X; ++x) for(std::size_t x = 0; x < X; ++x)
{ {
int w_tmp = wi + arg.in_left_pads_[2] - auto w_tmp =
x * arg.conv_dilations_[2]; ck::type_convert<ck::long_index_t>(wi) +
ck::type_convert<ck::long_index_t>(
arg.in_left_pads_[2]) -
ck::type_convert<ck::long_index_t>(
x * arg.conv_dilations_[2]);
if(w_tmp % arg.conv_strides_[2] == 0) if(w_tmp % arg.conv_strides_[2] == 0)
{ {
int wo = w_tmp / arg.conv_strides_[2]; auto wo =
if(wo >= 0 && wo < Wo) ck::type_convert<ck::long_index_t>(w_tmp) /
ck::type_convert<ck::long_index_t>(
arg.conv_strides_[2]);
if(wo >= 0 &&
ck::type_convert<std::size_t>(wo) < Wo)
{ {
for(int k = 0; k < K; ++k) for(std::size_t k = 0; k < K; ++k)
{ {
AccDataType v_out = 0; AccDataType v_out = 0;
AccDataType v_wei = 0; AccDataType v_wei = 0;
...@@ -264,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -264,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
#ifndef REFERENCE_CONV_FWD_HPP #pragma once
#define REFERENCE_CONV_FWD_HPP
#include <iostream> #include <iostream>
#include <type_traits> #include <type_traits>
#include <sstream> #include <sstream>
#include "stream_config.hpp"
#include "device_base.hpp" #include "device_base.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
...@@ -88,13 +89,16 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -88,13 +89,16 @@ struct ReferenceConvFwd : public device::BaseOperator
auto f_ncw = [&](auto n, auto k, auto wo) { auto f_ncw = [&](auto n, auto k, auto wo) {
float v_acc = 0; float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{ {
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x) for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x)
{ {
int wi = wo * arg.conv_strides_[0] + x * arg.conv_dilations_[0] - auto wi =
arg.in_left_pads_[0]; ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[0]) +
if(wi >= 0 && wi < arg.input_.mDesc.GetLengths()[2]) ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.mDesc.GetLengths()[2])
{ {
float v_in; float v_in;
float v_wei; float v_wei;
...@@ -128,18 +132,26 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -128,18 +132,26 @@ struct ReferenceConvFwd : public device::BaseOperator
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0; float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y) for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y)
{ {
int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - auto hi =
arg.in_left_pads_[0]; ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x) ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x)
{ {
int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - auto wi =
arg.in_left_pads_[1]; ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
if(hi >= 0 && hi < arg.input_.mDesc.GetLengths()[2] && wi >= 0 && ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
wi < arg.input_.mDesc.GetLengths()[3]) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.input_.mDesc.GetLengths()[2] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.input_.mDesc.GetLengths()[3])
{ {
float v_in; float v_in;
float v_wei; float v_wei;
...@@ -174,23 +186,37 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -174,23 +186,37 @@ struct ReferenceConvFwd : public device::BaseOperator
auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) {
float v_acc = 0; float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c) for(std::size_t c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{ {
for(int z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z) for(std::size_t z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z)
{ {
int di = d_o * arg.conv_strides_[0] + z * arg.conv_dilations_[0] - auto di =
arg.in_left_pads_[0]; ck::type_convert<ck::long_index_t>(d_o * arg.conv_strides_[0]) +
for(int y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y) ck::type_convert<ck::long_index_t>(z * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y)
{ {
int hi = ho * arg.conv_strides_[1] + y * arg.conv_dilations_[1] - auto hi =
arg.in_left_pads_[1]; ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[1]) +
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x) ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[1]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
for(std::size_t x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x)
{ {
int wi = wo * arg.conv_strides_[2] + auto wi =
x * arg.conv_dilations_[2] - arg.in_left_pads_[2]; ck::type_convert<ck::long_index_t>(wo *
if(di >= 0 && di < arg.input_.mDesc.GetLengths()[2] && arg.conv_strides_[2]) +
hi >= 0 && hi < arg.input_.mDesc.GetLengths()[3] && ck::type_convert<ck::long_index_t>(x *
wi >= 0 && wi < arg.input_.mDesc.GetLengths()[4]) arg.conv_dilations_[2]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[2]);
if(di >= 0 &&
ck::type_convert<std::size_t>(di) <
arg.input_.mDesc.GetLengths()[2] &&
hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.input_.mDesc.GetLengths()[3] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.input_.mDesc.GetLengths()[4])
{ {
float v_in; float v_in;
float v_wei; float v_wei;
...@@ -226,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -226,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
} }
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
...@@ -286,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -286,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
} // namespace host } // namespace host
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
...@@ -73,18 +73,25 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator ...@@ -73,18 +73,25 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0; float v_acc = 0;
for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
{ {
int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
arg.in_left_pads_[0]; ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
{ {
int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - auto wi =
arg.in_left_pads_[1]; ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
float v_in; float v_in;
float v_wei; float v_wei;
...@@ -117,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator ...@@ -117,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -76,18 +76,25 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator ...@@ -76,18 +76,25 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0; float v_acc = 0;
for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
{ {
int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - auto hi = ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
arg.in_left_pads_[0]; ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
{ {
int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - auto wi =
arg.in_left_pads_[1]; ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
float v_in; float v_in;
float v_wei; float v_wei;
...@@ -123,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator ...@@ -123,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
#ifndef REFERENCE_GEMM_HPP #pragma once
#define REFERENCE_GEMM_HPP
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device_base.hpp" #include "device_base.hpp"
...@@ -82,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -82,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
...@@ -129,4 +128,3 @@ struct ReferenceGemm : public device::BaseOperator ...@@ -129,4 +128,3 @@ struct ReferenceGemm : public device::BaseOperator
} // namespace host } // namespace host
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
...@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator ...@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator ...@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator ...@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
return 0; return 0;
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{ {
return Run(*dynamic_cast<const Argument*>(p_arg)); return Run(*dynamic_cast<const Argument*>(p_arg));
} }
......
...@@ -146,19 +146,19 @@ struct ConvParams ...@@ -146,19 +146,19 @@ struct ConvParams
const std::vector<ck::index_t>& left_pads, const std::vector<ck::index_t>& left_pads,
const std::vector<ck::index_t>& right_pads); const std::vector<ck::index_t>& right_pads);
ck::index_t num_dim_spatial; ck::index_t num_dim_spatial_;
ck::index_t N; ck::index_t N_;
ck::index_t K; ck::index_t K_;
ck::index_t C; ck::index_t C_;
std::vector<ck::index_t> filter_spatial_lengths; std::vector<ck::index_t> filter_spatial_lengths_;
std::vector<ck::index_t> input_spatial_lengths; std::vector<ck::index_t> input_spatial_lengths_;
std::vector<ck::index_t> conv_filter_strides; std::vector<ck::index_t> conv_filter_strides_;
std::vector<ck::index_t> conv_filter_dilations; std::vector<ck::index_t> conv_filter_dilations_;
std::vector<ck::index_t> input_left_pads; std::vector<ck::index_t> input_left_pads_;
std::vector<ck::index_t> input_right_pads; std::vector<ck::index_t> input_right_pads_;
std::vector<ck::index_t> GetOutputSpatialLengths() const; std::vector<ck::index_t> GetOutputSpatialLengths() const;
}; };
...@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params, ...@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params,
auto ref_argument = ref_conv.MakeArgument(input, auto ref_argument = ref_conv.MakeArgument(input,
weights, weights,
output, output,
params.conv_filter_strides, params.conv_filter_strides_,
params.conv_filter_dilations, params.conv_filter_dilations_,
params.input_left_pads, params.input_left_pads_,
params.input_right_pads, params.input_right_pads_,
PassThrough{}, PassThrough{},
PassThrough{}, PassThrough{},
PassThrough{}); PassThrough{});
...@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual InTensorsTuple GetInputTensors() const override virtual InTensorsTuple GetInputTensors() const override
{ {
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N), std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N_),
static_cast<std::size_t>(params_.C)}; static_cast<std::size_t>(params_.C_)};
input_dims.insert(std::end(input_dims), input_dims.insert(std::end(input_dims),
std::begin(params_.input_spatial_lengths), std::begin(params_.input_spatial_lengths_),
std::end(params_.input_spatial_lengths)); std::end(params_.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K), std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K_),
static_cast<std::size_t>(params_.C)}; static_cast<std::size_t>(params_.C_)};
filter_dims.insert(std::end(filter_dims), filter_dims.insert(std::end(filter_dims),
std::begin(params_.filter_spatial_lengths), std::begin(params_.filter_spatial_lengths_),
std::end(params_.filter_spatial_lengths)); std::end(params_.filter_spatial_lengths_));
auto input = std::make_unique<Tensor<InDataType>>( auto input = std::make_unique<Tensor<InDataType>>(
get_host_tensor_descriptor(input_dims, InLayout{})); get_host_tensor_descriptor(input_dims, InLayout{}));
...@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual TensorPtr<OutDataType> GetOutputTensor() const override virtual TensorPtr<OutDataType> GetOutputTensor() const override
{ {
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N), std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N_),
static_cast<std::size_t>(params_.K)}; static_cast<std::size_t>(params_.K_)};
output_dims.insert(std::end(output_dims), output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths_), std::begin(output_spatial_lengths_),
std::end(output_spatial_lengths_)); std::end(output_spatial_lengths_));
...@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()), static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()),
static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()), static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()),
params_.N, params_.N_,
params_.K, params_.K_,
params_.C, params_.C_,
params_.input_spatial_lengths, params_.input_spatial_lengths_,
params_.filter_spatial_lengths, params_.filter_spatial_lengths_,
output_spatial_lengths_, output_spatial_lengths_,
params_.conv_filter_strides, params_.conv_filter_strides_,
params_.conv_filter_dilations, params_.conv_filter_dilations_,
params_.input_left_pads, params_.input_left_pads_,
params_.input_right_pads, params_.input_right_pads_,
InElementwiseOp{}, InElementwiseOp{},
WeiElementwiseOp{}, WeiElementwiseOp{},
OutElementwiseOp{}); OutElementwiseOp{});
...@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual std::size_t GetFlops() const override virtual std::size_t GetFlops() const override
{ {
return get_flops(params_.N, return get_flops(params_.N_,
params_.C, params_.C_,
params_.K, params_.K_,
params_.filter_spatial_lengths, params_.filter_spatial_lengths_,
output_spatial_lengths_); output_spatial_lengths_);
} }
virtual std::size_t GetBtype() const override virtual std::size_t GetBtype() const override
{ {
return get_btype<InDataType, WeiDataType, OutDataType>(params_.N, return get_btype<InDataType, WeiDataType, OutDataType>(params_.N_,
params_.C, params_.C_,
params_.K, params_.K_,
params_.input_spatial_lengths, params_.input_spatial_lengths_,
params_.filter_spatial_lengths, params_.filter_spatial_lengths_,
output_spatial_lengths_); output_spatial_lengths_);
} }
......
...@@ -128,7 +128,7 @@ class OpInstanceRunEngine ...@@ -128,7 +128,7 @@ class OpInstanceRunEngine
template <typename OpInstancePtr> template <typename OpInstancePtr>
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs, ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
int nrepeat = 100, bool time_kernel = false,
bool do_verification = false, bool do_verification = false,
bool do_log = false) bool do_log = false)
{ {
...@@ -143,7 +143,7 @@ class OpInstanceRunEngine ...@@ -143,7 +143,7 @@ class OpInstanceRunEngine
if(op_ptr->IsSupportedArgument(argument.get())) if(op_ptr->IsSupportedArgument(argument.get()))
{ {
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker->Run(argument.get(), nrepeat); float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flops = op_instance_.GetFlops(); std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype(); std::size_t num_btype = op_instance_.GetBtype();
......
...@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE ...@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
host_tensor.cpp host_tensor.cpp
) )
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
add_library(composable_kernel::host_tensor ALIAS host_tensor)
target_compile_features(host_tensor PUBLIC) target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>) target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
install(TARGETS host_tensor LIBRARY DESTINATION lib)
target_include_directories(host_tensor PUBLIC
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
)
install(TARGETS host_tensor
EXPORT host_tensorTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(EXPORT host_tensorTargets
FILE composable_kernelhost_tensorTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
clang_tidy_check(host_tensor) clang_tidy_check(host_tensor)
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{ {
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize)); hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
} }
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
...@@ -12,18 +12,17 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; } ...@@ -12,18 +12,17 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p) void DeviceMem::ToDevice(const void* p)
{ {
hipGetErrorString( hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
} }
void DeviceMem::FromDevice(void* p) void DeviceMem::FromDevice(void* p)
{ {
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
} }
void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); } void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment) DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
: mMemSize(mem_size), mAlignment(alignment) : mMemSize(mem_size), mAlignment(alignment)
...@@ -68,32 +67,32 @@ struct KernelTimerImpl ...@@ -68,32 +67,32 @@ struct KernelTimerImpl
{ {
KernelTimerImpl() KernelTimerImpl()
{ {
hipGetErrorString(hipEventCreate(&mStart)); hip_check_error(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd)); hip_check_error(hipEventCreate(&mEnd));
} }
~KernelTimerImpl() ~KernelTimerImpl()
{ {
hipGetErrorString(hipEventDestroy(mStart)); hip_check_error(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd)); hip_check_error(hipEventDestroy(mEnd));
} }
void Start() void Start()
{ {
hipGetErrorString(hipDeviceSynchronize()); hip_check_error(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr)); hip_check_error(hipEventRecord(mStart, nullptr));
} }
void End() void End()
{ {
hipGetErrorString(hipEventRecord(mEnd, nullptr)); hip_check_error(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd)); hip_check_error(hipEventSynchronize(mEnd));
} }
float GetElapsedTime() const float GetElapsedTime() const
{ {
float time; float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd)); hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
return time; return time;
} }
......
...@@ -25,7 +25,7 @@ std::size_t HostTensorDescriptor::GetElementSize() const ...@@ -25,7 +25,7 @@ std::size_t HostTensorDescriptor::GetElementSize() const
std::size_t HostTensorDescriptor::GetElementSpace() const std::size_t HostTensorDescriptor::GetElementSpace() const
{ {
std::size_t space = 1; std::size_t space = 1;
for(int i = 0; i < mLens.size(); ++i) for(std::size_t i = 0; i < mLens.size(); ++i)
{ {
space += (mLens[i] - 1) * mStrides[i]; space += (mLens[i] - 1) * mStrides[i];
} }
...@@ -68,7 +68,7 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream ...@@ -68,7 +68,7 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
// FIXME: remove // FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst) void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{ {
for(int i = 0; i < src.mData.size(); ++i) for(std::size_t i = 0; i < src.mData.size(); ++i)
dst.mData[i] = ck::type_convert<float>(src.mData[i]); dst.mData[i] = ck::type_convert<float>(src.mData[i]);
} }
#endif #endif
...@@ -11,6 +11,7 @@ include_directories(BEFORE ...@@ -11,6 +11,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/host
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/external/include/half ${PROJECT_SOURCE_DIR}/external/include/half
...@@ -18,7 +19,7 @@ include_directories(BEFORE ...@@ -18,7 +19,7 @@ include_directories(BEFORE
function(add_instance_library INSTANCE_NAME) function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}") message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} SHARED ${ARGN}) add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC) target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME) endfunction(add_instance_library INSTANCE_NAME)
...@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data) ...@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data)
add_subdirectory(grouped_gemm) add_subdirectory(grouped_gemm)
add_subdirectory(conv2d_bwd_weight) add_subdirectory(conv2d_bwd_weight)
add_subdirectory(batched_gemm_reduce) add_subdirectory(batched_gemm_reduce)
# Collect the object files of every per-operation instance library, together
# with device_conv2d.cpp, into one static archive.
add_library(device_operations STATIC
    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
    $<TARGET_OBJECTS:device_batched_gemm_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
    $<TARGET_OBJECTS:device_gemm_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
    $<TARGET_OBJECTS:device_reduce_instance>
    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
    $<TARGET_OBJECTS:device_grouped_gemm_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
    device_conv2d.cpp
)

# In-tree alias using the same "composable_kernel::" namespace that the
# install(EXPORT ... NAMESPACE composable_kernel::) rule gives the installed
# target, so one name works for both build-tree and installed consumers.
add_library(composable_kernel::device_operations ALIAS device_operations)
# Original alias spelling, kept so existing references continue to resolve.
add_library(composablekernels::device_operations ALIAS device_operations)
# Source header trees that the install(DIRECTORY ...) rule copies. The trailing
# slashes make CMake copy each directory's contents rather than the directory.
set(DEV_OPS_INC_DIRS
    ${PROJECT_SOURCE_DIR}/include/ck/
    ${PROJECT_SOURCE_DIR}/library/include/ck/
    ${PROJECT_SOURCE_DIR}/external/include/
)

target_compile_features(device_operations PUBLIC)
set_property(TARGET device_operations PROPERTY POSITION_INDEPENDENT_CODE ON)

# Include paths for consumers of the installed target only; each
# INSTALL_INTERFACE expression evaluates to nothing in the build tree.
target_include_directories(
    device_operations
    PUBLIC
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>"
)
# The offload architecture is fixed to gfx908 for now. Once more architectures
# are enabled, turn this into an option in the main CMake file and pass it down
# here so it can be exported.
target_compile_options(device_operations PRIVATE --offload-arch=gfx908)

# Install the library artifacts and register the target in the
# device_operationsTargets export set.
install(
    TARGETS device_operations
    EXPORT device_operationsTargets
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

# Copy the header trees listed in DEV_OPS_INC_DIRS under <includedir>/ck.
install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)

# Write the export set as an importable targets file under the
# composable_kernel:: namespace.
install(
    EXPORT device_operationsTargets
    NAMESPACE composable_kernel::
    FILE composable_kerneldevice_operationsTargets.cmake
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
...@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE ...@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp; device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
) )
add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE}) add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
target_compile_features(device_batched_gemm_instance PUBLIC) # target_compile_features(device_batched_gemm_instance PUBLIC)
set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib) # install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_batched_gemm_instance) clang_tidy_check(device_batched_gemm_instance)
...@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE ...@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
) )
add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE}) add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib) target_compile_features(device_batched_gemm_reduce_instance PUBLIC)
set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
clang_tidy_check(device_batched_gemm_reduce_instance) clang_tidy_check(device_batched_gemm_reduce_instance)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment