"vscode:/vscode.git/clone" did not exist on "22827c8e82ca589b96e0144f5c74572877ae0e50"
Commit 7fd0e649 authored by Chaitanya Inumella's avatar Chaitanya Inumella
Browse files

Added changes adapting the contraction example's initialization and logging to the cuTENSOR library's sample style

parent 667e52cc
#include <iostream> #include <iostream>
#include <fstream>
#include <numeric> #include <numeric>
#include <initializer_list> #include <initializer_list>
#include <cstdlib> #include <cstdlib>
...@@ -37,7 +38,8 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -37,7 +38,8 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; //static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off // clang-format off
using DeviceOpInstance = ck::tensor_operation::device:: using DeviceOpInstance = ck::tensor_operation::device::
...@@ -48,6 +50,23 @@ using DeviceOpInstance = ck::tensor_operation::device:: ...@@ -48,6 +50,23 @@ using DeviceOpInstance = ck::tensor_operation::device::
DeviceContraction_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>; DeviceContraction_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>;
// clang-format on // clang-format on
// Write every element of `range` to `os`, cast to T, separated by `delim`
// (no leading or trailing delimiter; empty ranges write nothing).
// Generalized from std::ofstream& to std::ostream& so any output stream
// (file, string stream, std::cout) works; existing std::ofstream callers
// are unaffected. `delim` is taken by const reference to avoid a copy.
template <typename T, typename Range>
void LogRangeToFile(std::ostream& os, Range&& range, const std::string& delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        // Emit the delimiter before every element except the first.
        if(first)
            first = false;
        else
            os << delim;
        os << static_cast<T>(v);
    }
}
// hardcoded for NumDimM == NumDimN == NumDimK == 2 // hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM, template <ck::index_t NumDimM,
ck::index_t NumDimN, ck::index_t NumDimN,
...@@ -197,7 +216,7 @@ using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM, ...@@ -197,7 +216,7 @@ using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 3;
bool time_kernel = false; bool time_kernel = false;
if(argc == 4) if(argc == 4)
...@@ -209,21 +228,42 @@ int main(int argc, char* argv[]) ...@@ -209,21 +228,42 @@ int main(int argc, char* argv[])
else else
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value, 3=cutensor_style_init)\n");
printf("arg3: time kernel (0=no, 1=yes)\n"); printf("arg3: time kernel (0=no, 1=yes)\n");
exit(0); exit(0);
} }
// A[M0, M1, K0, K1] std::ofstream tensorA;
std::ofstream tensorB;
std::ofstream tensorC;
std::ofstream tensorC_d;
std::cout << "RAND_MAX value is " << RAND_MAX << std::endl;
#if 0
// a[m0, m1, k0, k1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64}; std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1}; //std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[K0, K1, N0, N1] // b[k0, k1, n0, n1]
std::vector<ck::index_t> b_ks_ns_lengths{32, 64, 32, 64}; std::vector<ck::index_t> b_ks_ns_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ks_ns_strides{128, 1, 524288, 4096}; //std::vector<ck::index_t> b_ks_ns_strides{128, 1, 524288, 4096};
// C[M0, M1, N0, N1] // c[m0, m1, n0, n1]
std::vector<ck::index_t> c_ms_ns_lengths{30, 128, 32, 64}; std::vector<ck::index_t> c_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> c_ms_ns_strides{524288, 4096, 128, 1}; //std::vector<ck::index_t> c_ms_ns_strides{524288, 4096, 128, 1};
#else
// a[m0, m1, k0, k1]
std::vector<ck::index_t> a_ms_ks_lengths{5,6,3,4};
//std::vector<ck::index_t> a_ms_ks_strides{108,20,16,1};
// b[k0, k1, n0, n1]
std::vector<ck::index_t> b_ks_ns_lengths{3,4,3,4};
//std::vector<ck::index_t> b_ks_ns_strides{16,1,108,20};
// c[m0, m1, n0, n1]
std::vector<ck::index_t> c_ms_ns_lengths{5,6,3,4};
//std::vector<ck::index_t> c_ms_ns_strides{108,20,16,1};
#endif
#if 0
Tensor<ADataType> a_ms_ks( Tensor<ADataType> a_ms_ks(
std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()), std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end())); std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
...@@ -236,6 +276,16 @@ int main(int argc, char* argv[]) ...@@ -236,6 +276,16 @@ int main(int argc, char* argv[])
Tensor<CDataType> c_ms_ns_device_result( Tensor<CDataType> c_ms_ns_device_result(
std::vector<std::size_t>(c_ms_ns_lengths.begin(), c_ms_ns_lengths.end()), std::vector<std::size_t>(c_ms_ns_lengths.begin(), c_ms_ns_lengths.end()),
std::vector<std::size_t>(c_ms_ns_strides.begin(), c_ms_ns_strides.end())); std::vector<std::size_t>(c_ms_ns_strides.begin(), c_ms_ns_strides.end()));
#else
Tensor<ADataType> a_ms_ks(
std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()));
Tensor<BDataType> b_ks_ns(
std::vector<std::size_t>(b_ks_ns_lengths.begin(), b_ks_ns_lengths.end()));
Tensor<CDataType> c_ms_ns_host_result(
std::vector<std::size_t>(c_ms_ns_lengths.begin(), c_ms_ns_lengths.end()));
Tensor<CDataType> c_ms_ns_device_result(
std::vector<std::size_t>(c_ms_ns_lengths.begin(), c_ms_ns_lengths.end()));
#endif
std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl; std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
std::cout << "b_ks_ns: " << b_ks_ns.mDesc << std::endl; std::cout << "b_ks_ns: " << b_ks_ns.mDesc << std::endl;
...@@ -252,6 +302,10 @@ int main(int argc, char* argv[]) ...@@ -252,6 +302,10 @@ int main(int argc, char* argv[])
a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_ks_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_ks_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break; break;
case 3:
a_ms_ks.GenerateTensorValue(GeneratorTensor_cuTensor<ADataType>{});
b_ks_ns.GenerateTensorValue(GeneratorTensor_cuTensor<BDataType>{});
break;
default: default:
a_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); a_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_ks_ns.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); b_ks_ns.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
...@@ -261,6 +315,10 @@ int main(int argc, char* argv[]) ...@@ -261,6 +315,10 @@ int main(int argc, char* argv[])
DeviceMem b_ks_ns_device_buf(sizeof(BDataType) * b_ks_ns.mDesc.GetElementSpace()); DeviceMem b_ks_ns_device_buf(sizeof(BDataType) * b_ks_ns.mDesc.GetElementSpace());
DeviceMem c_ms_ns_device_buf(sizeof(CDataType) * c_ms_ns_device_result.mDesc.GetElementSpace()); DeviceMem c_ms_ns_device_buf(sizeof(CDataType) * c_ms_ns_device_result.mDesc.GetElementSpace());
std::cout << "Tensor A element space: " << a_ms_ks.mDesc.GetElementSpace() << std::endl;
std::cout << "Tensor B element space: " << b_ks_ns.mDesc.GetElementSpace() << std::endl;
std::cout << "Tensor C element space: " << c_ms_ns_device_result.mDesc.GetElementSpace() << std::endl;
a_ms_ks_device_buf.ToDevice(a_ms_ks.mData.data()); a_ms_ks_device_buf.ToDevice(a_ms_ks.mData.data());
b_ks_ns_device_buf.ToDevice(b_ks_ns.mData.data()); b_ks_ns_device_buf.ToDevice(b_ks_ns.mData.data());
...@@ -278,11 +336,11 @@ int main(int argc, char* argv[]) ...@@ -278,11 +336,11 @@ int main(int argc, char* argv[])
static_cast<BDataType*>(b_ks_ns_device_buf.GetDeviceBuffer()), static_cast<BDataType*>(b_ks_ns_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_ms_ns_device_buf.GetDeviceBuffer()), static_cast<CDataType*>(c_ms_ns_device_buf.GetDeviceBuffer()),
a_ms_ks_lengths, a_ms_ks_lengths,
a_ms_ks_strides, std::vector<ck::index_t>(a_ms_ks.mDesc.mStrides.begin(), a_ms_ks.mDesc.mStrides.end()),
b_ks_ns_lengths, b_ks_ns_lengths,
b_ks_ns_strides, std::vector<ck::index_t>(b_ks_ns.mDesc.mStrides.begin(), b_ks_ns.mDesc.mStrides.end()),
c_ms_ns_lengths, c_ms_ns_lengths,
c_ms_ns_strides, std::vector<ck::index_t>(c_ms_ns_host_result.mDesc.mStrides.begin(), c_ms_ns_host_result.mDesc.mStrides.end()),
a_element_op, a_element_op,
b_element_op, b_element_op,
c_element_op); c_element_op);
...@@ -324,6 +382,17 @@ int main(int argc, char* argv[]) ...@@ -324,6 +382,17 @@ int main(int argc, char* argv[])
c_ms_ns_device_buf.FromDevice(c_ms_ns_device_result.mData.data()); c_ms_ns_device_buf.FromDevice(c_ms_ns_device_result.mData.data());
tensorA.open("tensor_A.txt");
LogRangeToFile<ADataType>(tensorA, a_ms_ks.mData, ",");
LogRangeAsType<ADataType>(std::cout<<"Tensor A elements:\n", a_ms_ks.mData,",");
std::cout<<std::endl;
tensorA.close();
tensorB.open("tensor_B.txt");
LogRangeToFile<BDataType>(tensorB, b_ks_ns.mData, ",");
LogRangeAsType<BDataType>(std::cout<<"Tensor B elements:\n", b_ks_ns.mData,",");
std::cout<<std::endl;
tensorB.close();
if(do_verification) if(do_verification)
{ {
auto ref_gemm = ReferenceOpInstance{}; auto ref_gemm = ReferenceOpInstance{};
...@@ -334,6 +403,19 @@ int main(int argc, char* argv[]) ...@@ -334,6 +403,19 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
tensorC.open("tensor_C_contraction_host_results.txt");
LogRangeToFile<CDataType>(tensorC, c_ms_ns_host_result.mData, ",");
LogRangeAsType<CDataType>(std::cout<<"Tensor C_host elements:\n", c_ms_ns_host_result.mData, ",");
std::cout<<std::endl;
tensorC.close();
// Dump the device (GPU) contraction result to file and stdout.
// BUG(review): the file is opened and closed via `tensorC`, but the writes
// below go through `tensorC_d`, which is never opened — so
// "tensor_C_contraction_device_results.txt" is created empty and the
// device-result text is lost. Either open/close `tensorC_d` here or pass
// `tensorC` to LogRangeToFile.
tensorC.open("tensor_C_contraction_device_results.txt");
LogRangeToFile<CDataType>(tensorC_d, c_ms_ns_device_result.mData, ",");
LogRangeAsType<CDataType>(std::cout<<"Tensor C_device elements:\n", c_ms_ns_device_result.mData, ",");
std::cout<<std::endl;
tensorC.close();
return ck::utils::check_err(c_ms_ns_device_result.mData, c_ms_ns_host_result.mData) ? 0 : 1; return ck::utils::check_err(c_ms_ns_device_result.mData, c_ms_ns_host_result.mData) ? 0 : 1;
} }
......
...@@ -32,7 +32,7 @@ inline void hip_check_error(hipError_t x) ...@@ -32,7 +32,7 @@ inline void hip_check_error(hipError_t x)
struct DeviceMem struct DeviceMem
{ {
DeviceMem() = default; DeviceMem() = delete;
DeviceMem(std::size_t mem_size); DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer(); void* GetDeviceBuffer();
std::size_t GetBufferSize(); std::size_t GetBufferSize();
......
...@@ -109,7 +109,7 @@ struct HostTensorDescriptor ...@@ -109,7 +109,7 @@ struct HostTensorDescriptor
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc); friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
private: //private:
std::vector<std::size_t> mLens; std::vector<std::size_t> mLens;
std::vector<std::size_t> mStrides; std::vector<std::size_t> mStrides;
}; };
......
...@@ -74,6 +74,7 @@ struct GeneratorTensor_2<ck::bhalf_t> ...@@ -74,6 +74,7 @@ struct GeneratorTensor_2<ck::bhalf_t>
ck::bhalf_t operator()(Is...) ck::bhalf_t operator()(Is...)
{ {
float tmp = (std::rand() % (max_value - min_value)) + min_value; float tmp = (std::rand() % (max_value - min_value)) + min_value;
std::cout << tmp << ",";
return ck::type_convert<ck::bhalf_t>(tmp); return ck::type_convert<ck::bhalf_t>(tmp);
} }
}; };
...@@ -123,6 +124,35 @@ struct GeneratorTensor_3<ck::bhalf_t> ...@@ -123,6 +124,35 @@ struct GeneratorTensor_3<ck::bhalf_t>
} }
}; };
// Tensor-value generator mirroring the initialization used by cuTENSOR's
// sample code: pseudo-random values uniformly distributed in [-50, 50],
// driven by std::rand() (deterministic for a fixed std::srand seed).
// The variadic index arguments exist only to satisfy the generator
// interface used by GenerateTensorValue; they are ignored.
template <typename T>
struct GeneratorTensor_cuTensor
{
    template <typename... Is>
    T operator()(Is...)
    {
        // rand()/RAND_MAX lies in [0, 1]; shift to [-0.5, 0.5] and scale
        // by 100 to get the cuTENSOR-style [-50, 50] range.
        const float tmp = (static_cast<float>(std::rand()) / RAND_MAX - 0.5) * 100;
        return static_cast<T>(tmp);
    }
};
// Specialization for ck::bhalf_t: generate the same cuTENSOR-style random
// value in [-50, 50] as the primary template, but route the result through
// ck::type_convert since bhalf_t has no direct conversion from float.
// Index arguments are accepted to fit the generator interface and ignored.
template <>
struct GeneratorTensor_cuTensor<ck::bhalf_t>
{
    template <typename... Is>
    ck::bhalf_t operator()(Is...)
    {
        const float scaled = (static_cast<float>(std::rand()) / RAND_MAX - 0.5) * 100;
        return ck::type_convert<ck::bhalf_t>(scaled);
    }
};
struct GeneratorTensor_Checkboard struct GeneratorTensor_Checkboard
{ {
template <typename... Ts> template <typename... Ts>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment