Commit f906973c authored by Jing Zhang

Added an example computing the AMAX (maximum absolute value) of a matrix

parent bc1108bb
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
add_example_executable(example_reduce_blockwise_two_call_amax reduce_blockwise_two_call_amax.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <array>
#include <tuple>
#include <vector>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InOutDataType = ck::half_t;
using AccDataType = float;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
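// Two-call AMAX of a 2-D matrix: the first DeviceReduce instance collapses dimension 0 of the
// (8192 x 8192) input into a length-8192 vector of max(|x|) values; the second instance then
// collapses that vector into a single scalar.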
using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
2, // Rank
1, // NumReduceDim
ReduceOperation,
InElementwiseOperation,
PassThroughOp,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256, // BlockSize
32,  // MThreadClusterSize
8,   // KThreadClusterSize
1,   // MThreadSliceSize
1,   // KThreadSliceSize
1,   // InSrcVectorDim
1,   // InSrcVectorSize
1>;  // OutDstVectorSize
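// The second instance works on the rank-1 intermediate tensor; |x| was already applied by the
// first call, so its input elementwise operation is a plain pass-through.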
using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
1, // Rank
1, // NumReduceDim
ReduceOperation,
PassThroughOp,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256, // BlockSize
128, // MThreadClusterSize
2,   // KThreadClusterSize
1,   // MThreadSliceSize
1,   // KThreadSliceSize
1,   // InSrcVectorDim
1,   // InSrcVectorSize
1>;  // OutDstVectorSize
static bool do_verify;
static int init_method;
static float alpha;
static float beta;
static bool time_kernel;
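// Command-line arguments: <do_verify 0/1> <init_method> <time_kernel 0/1>. With no arguments the
// example verifies against the host reference and times the kernels.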
int main(int argc, char* argv[])
{
// used by the device reduction
const std::array<int, 1> reduceDims_1 = {0};
const std::array<int, 1> reduceDims_2 = {0};
// used by the host reduction
const std::array<int, 2> reduceDims = {0, 1};
const std::vector<size_t> inLengths_1 = {8192, 8192};
// input lengths of the second reduction, which is also the output lengths of the first
// reduction
const std::vector<size_t> inLengths_2 = {8192};
const std::vector<size_t> outLengths = {1};
if(argc == 1)
{
do_verify = true;
init_method = 2;
time_kernel = true;
}
else if(argc == 4)
{
do_verify = static_cast<bool>(atoi(argv[1]));
init_method = atoi(argv[2]);
time_kernel = static_cast<bool>(atoi(argv[3]));
}
else
{
std::ostringstream ostr;
ostr << "Wrong parameter! " << std::endl
<< "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;
throw std::runtime_error(ostr.str());
};
alpha = 1.0f;
beta = 0.0f;
Tensor<InOutDataType> in_1(inLengths_1);
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
Tensor<InOutDataType> out(outLengths);
auto inStrides_1 = in_1.mDesc.GetStrides();
auto inStrides_2 = in_2.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
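// invariant_total_length is the number of output elements (1 here); reduce_total_length is the
// number of input elements folded into each output (8192 * 8192).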
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
if(do_verify)
{
switch(init_method)
{
case 0: break;
case 1:
in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break;
case 2:
in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break;
default:
in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
};
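// Device buffers: the input matrix, the intermediate vector (output of the first call and input
// of the second), and the final scalar result.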
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize());
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
in_1_dev.ToDevice(in_1.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
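// Query the elementwise operations matching AMAX: |x| is applied to each input element before the
// max reduction. The element count is passed for reductions that need it (e.g. AVG).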
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, 2> arrInLengths_1;
std::array<index_t, 2> arrInStrides_1;
std::array<index_t, 1> arrInLengths_2;
std::array<index_t, 1> arrInStrides_2;
std::array<index_t, 1> arrOutLengths;
std::array<index_t, 1> arrOutStrides;
ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verify)
{
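// Host reference: reduce both dimensions in a single call (Rank 2, NumReduceDim 2); for a
// max-based reduction this is equivalent to the two-call device sequence.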
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
2,
2,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in_1.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
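// First device call: reduce dimension 0 of the (8192 x 8192) matrix into the 8192-element
// intermediate buffer. alpha/beta are fixed at 1/0 here; the user-provided alpha and beta are
// applied only by the second call.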
auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrInLengths_2,
arrInStrides_2,
reduceDims_1,
1.0,
0.0,
in_1_dev.GetDeviceBuffer(),
nullptr,
in_2_dev.GetDeviceBuffer(),
nullptr,
in_elementwise_op,
PassThroughOp{});
if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
{
std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!"
<< std::endl;
};
auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
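// Second device call: reduce the intermediate vector to the final scalar, applying alpha and beta.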
auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
arrInStrides_2,
arrOutLengths,
arrOutStrides,
reduceDims_2,
static_cast<double>(alpha),
static_cast<double>(beta),
in_2_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
PassThroughOp{},
acc_elementwise_op);
if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
{
std::cout
<< "The runtime parameters are not supported by the second DeviceReduce instance, exiting!"
<< std::endl;
return 1;
};
auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();
float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});
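// Bandwidth accounting: bytes read for the full input matrix plus the final output element; the
// intermediate vector written by call 1 and re-read by call 2 is not counted here.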
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
invariant_total_length * sizeof(InOutDataType);
float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);
std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
<< reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;
bool pass = true;
if(do_verify)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
}