Commit 951a52b2 authored by letaoqin's avatar letaoqin
Browse files

rcr change to rrr

parent 635b5904
......@@ -154,17 +154,20 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
DeviceMem a_device_buf_re(sizeof(ADataType) * a_ms_ks_re.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf_re(sizeof(BDataType) * b_ns_ks_re.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf_re(sizeof(DDataType) * d_ms_ns_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_re(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_re(sizeof(EDataType) *
e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf_img(sizeof(ADataType) * a_ms_ks_img.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf_img(sizeof(BDataType) * b_ns_ks_img.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf_img(sizeof(DDataType) * d_ms_ns_img.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img(sizeof(EDataType) *
e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
// Intermediate Value For E Real and Img
DeviceMem e_device_buf_re1(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img1(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_re1(sizeof(EDataType) *
e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img1(sizeof(EDataType) *
e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
a_device_buf_re.ToDevice(a_ms_ks_re.mData.data());
b_device_buf_re.ToDevice(b_ns_ks_re.mData.data());
......@@ -191,7 +194,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
auto op = DeviceOpInstance{};
auto invoker = op.MakeInvoker();
auto argument_re1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
auto argument_re1 =
op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
b_device_buf_re.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
e_device_buf_re1.GetDeviceBuffer(),
......@@ -216,7 +220,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
float ave_time_re1 = invoker.Run(argument_re1, StreamConfig{nullptr, time_kernel});
alpha = -1.f;
beta = 1.f;
......@@ -228,7 +231,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
// For real Intermediate Value re_2
// auto op = DeviceOpInstance{};
// auto invoker = op.MakeInvoker();
auto argument_re2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
auto argument_re2 =
op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
b_device_buf_img.GetDeviceBuffer(),
std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
e_device_buf_re.GetDeviceBuffer(),
......@@ -253,7 +257,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
float ave_time_re2 = invoker.Run(argument_re2, StreamConfig{nullptr, time_kernel});
alpha = 1.f;
beta = 1.f;
......@@ -261,7 +264,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
b_element_op = BElementOp{};
cde_element_op = CDEElementOp{alpha, beta};
auto argument_img1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
auto argument_img1 =
op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
b_device_buf_img.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
e_device_buf_img1.GetDeviceBuffer(),
......@@ -277,7 +281,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_img1))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
......@@ -290,7 +293,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
alpha = 1.f;
beta = 1.f;
auto argument_img2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
auto argument_img2 =
op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
b_device_buf_re.GetDeviceBuffer(),
std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
e_device_buf_img.GetDeviceBuffer(),
......@@ -306,8 +310,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_img2))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
......@@ -317,7 +319,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
float ave_time_img2 = invoker.Run(argument_img2, StreamConfig{nullptr, time_kernel});
ck::index_t M =
ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
......@@ -331,7 +332,7 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N * 2;
float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1 ;
float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
......@@ -366,8 +367,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
auto ref_argument_re =
ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
auto ref_argument_re = ref_op.MakeArgument(
a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_re);
......@@ -376,7 +377,6 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
cde_element_op = CDEElementOp{alpha, beta};
for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0)
{
for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1)
......@@ -398,8 +398,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
cde_element_op = CDEElementOp{alpha, beta};
auto ref_argument_re1 =
ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
auto ref_argument_re1 = ref_op.MakeArgument(
a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_re1);
......@@ -421,15 +421,12 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
// Img Part Verification
Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
auto ref_argument_img =
ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
auto ref_argument_img = ref_op.MakeArgument(
a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_img);
......@@ -454,8 +451,8 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
}
}
auto ref_argument_img1 =
ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
auto ref_argument_img1 = ref_op.MakeArgument(
a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_img1);
......
......@@ -21,7 +21,7 @@ using AccDataType = F32;
using CShuffleDataType = F32;
using ALayout = Row;
using BLayout = Col;
using BLayout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using CLayout = Row;
......@@ -41,7 +41,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
// clang-format off
template <typename ADataType, typename BDataType, typename DsDataType, typename CDataType>
using DeviceOpInstance_64_16_16_64 = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3<
Row, Col, DsLayout, CLayout, ADataType, BDataType,
ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType,
DsDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CDEElementOp, GemmSpec,
64,
......@@ -51,14 +51,14 @@ using DeviceOpInstance_64_16_16_64 = ck::tensor_operation::device::DeviceGemmMul
1, 1,
S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 2, 2, 0,
1, 1,
S<1, 16, 1, 4>, S<4, 4>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, F16>;
template <typename ADataType, typename BDataType, typename DsDataType, typename CDataType>
using DeviceOpInstance_default = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3<
Row, Col, DsLayout, CLayout, ADataType, BDataType,
ALayout, BLayout, DsLayout, CLayout, ADataType, BDataType,
DsDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CDEElementOp, GemmSpec,
64,
......@@ -68,10 +68,10 @@ using DeviceOpInstance_default = ck::tensor_operation::device::DeviceGemmMultiD_
1, 1,
S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 1, 1, 0,
S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 1, 1, 0,
S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 1, 0,
1, 1,
S<1, 16, 1, 4>, S<1, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, F16>;
S<1, 16, 1, 4>, S<2, 2>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, F16>;
// clang-format on
......@@ -97,7 +97,7 @@ float gemm_bias_add_fp16(const GemmBiasAddArgs& args, const StreamConfig& config
auto cde_element_op = CDEElementOp{};
ck::index_t StrideA = args.K;
ck::index_t StrideB = args.K;
ck::index_t StrideB = args.N;
ck::index_t StrideD = 0;
ck::index_t StrideC = args.N;
......@@ -116,6 +116,7 @@ float gemm_bias_add_fp16(const GemmBiasAddArgs& args, const StreamConfig& config
StrideB,
std::array<ck::index_t, NumDTensor>{StrideD},
StrideC,
1,
a_element_op,
b_element_op,
cde_element_op);
......
......@@ -32,36 +32,25 @@ using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0Layout = Row;
using B0Layout = Col;
using B0Layout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
/// Host-side reference for a GEMM followed by a per-column bias add.
///
/// For every (m, n) this accumulates, in float,
///     sum over k of mat_A[m * K + k] * mat_B[n * K + k]
/// then adds mat_C[n] and stores the result, converted to ck::half_t, in
/// mat_D[m * N + n].
///
/// @param mat_A  Input A, read as M rows of K contiguous elements.
/// @param mat_B  Input B, read as N rows of K contiguous elements (index
///               n * K + k). NOTE(review): this matches a B stride of K; it
///               does not match a B buffer laid out with stride N - confirm
///               against the caller's StrideB.
/// @param mat_C  Bias, read only at index n, so the same value is added to
///               every row of column n.
/// @param mat_D  Output, written as M rows of N contiguous elements. The caller
///               must have sized it to at least M * N elements; it is not resized.
/// @param K      Reduction length.
/// @param M      Number of output rows.
/// @param N      Number of output columns.
void RunUnfusedTest(const std::vector<ck::half_t>& mat_A,
                    const std::vector<ck::half_t>& mat_B,
                    const std::vector<ck::half_t>& mat_C,
                    std::vector<ck::half_t>& mat_D,
                    int K,
                    int M,
                    int N)
{
    // Flat offsets are computed in the container's size_type so that products
    // such as m * K cannot overflow int for large problem shapes.
    using Index = std::vector<ck::half_t>::size_type;

    for(int m = 0; m < M; m++)
    {
        const Index a_row_offset = static_cast<Index>(m) * static_cast<Index>(K);
        const Index d_row_offset = static_cast<Index>(m) * static_cast<Index>(N);

        for(int n = 0; n < N; n++)
        {
            const Index b_row_offset = static_cast<Index>(n) * static_cast<Index>(K);

            // Accumulate in float; inputs and output are half precision.
            float psum = 0.f;
            for(int k = 0; k < K; k++)
            {
                const float areg = ck::type_convert<float>(mat_A[a_row_offset + k]);
                const float breg = ck::type_convert<float>(mat_B[b_row_offset + k]);
                psum += areg * breg;
            }

            // Bias is indexed by column only.
            psum += ck::type_convert<float>(mat_C[n]);
            mat_D[d_row_offset + n] = ck::type_convert<ck::half_t>(psum);
        }
    }
}
// Element-wise operator aliases. A, B and C all use the same PassThrough
// operator from ck::tensor_operation::element_wise.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
// Host reference GEMM type, instantiated with the A0/B0/E data types, the
// accumulator type and the three element-wise operators above.
// NOTE(review): no D (bias) data type is passed here, so this reference
// presumably computes the plain A*B product without the bias term - confirm
// that the verification path accounts for d0_m_n.
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B0DataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
int main(int argc, char* argv[])
{
......@@ -70,12 +59,12 @@ int main(int argc, char* argv[])
bool time_kernel = true;
// GEMM shape
ck::index_t M = 512;
ck::index_t N = 1024;
ck::index_t K = 256;
ck::index_t M = 16;
ck::index_t N = 16;
ck::index_t K = 64;
ck::index_t StrideA = K;
ck::index_t StrideB = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
......@@ -143,12 +132,12 @@ int main(int argc, char* argv[])
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{0});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{0});
}
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
......@@ -188,7 +177,15 @@ int main(int argc, char* argv[])
if(do_verification)
{
RunUnfusedTest(a0_m_k.mData, b0_k_n.mData, d0_m_n.mData, e_m_n_host_result.mData, K, M, N);
// RunUnfusedTest(a0_m_k.mData, b0_k_n.mData, d0_m_n.mData, e_m_n_host_result.mData, K, M,
// N);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a0_m_k, b0_k_n, e_m_n_host_result, AElementOp{}, BElementOp{}, CElementOp{});
ref_invoker.Run(ref_argument);
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment