"vscode:/vscode.git/clone" did not exist on "86da45bc66ee7ca782d5f498b0cacb490051d0f6"
Commit 1e95a6e2 authored by carlushuang's avatar carlushuang
Browse files

update script

parent 50ba9c44
...@@ -17,6 +17,10 @@ ...@@ -17,6 +17,10 @@
#define TEST_ELEMENTWISE_VERBOSE 1 #define TEST_ELEMENTWISE_VERBOSE 1
#endif #endif
#ifndef TEST_ELEMENTWISE_HIPGRAPH
#define TEST_ELEMENTWISE_HIPGRAPH 1
#endif
template <typename T> template <typename T>
void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x) void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
{ {
...@@ -149,6 +153,13 @@ bool test_cast(ck_tile::ArgParser args) ...@@ -149,6 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
t_.input_type = input_prec; t_.input_type = input_prec;
t_.output_type = output_prec; t_.output_type = output_prec;
t_.op = std::string("cast"); t_.op = std::string("cast");
t_.num_cu = [&]() {
hipDeviceProp_t dev_prop;
hipDevice_t dev;
HIP_CHECK_ERROR(hipGetDevice(&dev));
HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
return dev_prop.multiProcessorCount;
}();
return t_; return t_;
}(); }();
...@@ -161,11 +172,65 @@ bool test_cast(ck_tile::ArgParser args) ...@@ -161,11 +172,65 @@ bool test_cast(ck_tile::ArgParser args)
}(); }();
#if TEST_ELEMENTWISE_VERBOSE #if TEST_ELEMENTWISE_VERBOSE
ck_tile::stream_config sc{nullptr, true}; #if !TEST_ELEMENTWISE_HIPGRAPH
ck_tile::stream_config sc{nullptr, true, 0, 20, 50, false};
// ck_tile::stream_config sc{nullptr}; // ck_tile::stream_config sc{nullptr};
auto ms = elementwise(trait, karg, sc); auto ms = elementwise(trait, karg, sc);
#else
float ms = 0;
{
int repeat = 50;
int warpup = 20;
hipGraph_t graph_;
hipStream_t stream_;
HIP_CHECK_ERROR(hipStreamCreate(&stream_));
ck_tile::stream_config sc{stream_};
HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
for(int i_r = 0; i_r < repeat; i_r++) {
elementwise(trait, karg, sc);
}
HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
hipGraphExec_t instance_;
HIP_CHECK_ERROR(hipGraphInstantiate(&instance_, graph_, nullptr, nullptr, 0));
hipEvent_t start_, stop_;
HIP_CHECK_ERROR(hipEventCreate(&start_));
HIP_CHECK_ERROR(hipEventCreate(&stop_));
//warm-up
for(int i_r = 0; i_r < warpup; i_r++) {
elementwise(trait, karg, sc);
}
HIP_CHECK_ERROR(hipDeviceSynchronize());
HIP_CHECK_ERROR(hipEventRecord(start_, sc.stream_id_));
HIP_CHECK_ERROR(hipGraphLaunch(instance_, sc.stream_id_));
HIP_CHECK_ERROR(hipEventRecord(stop_, sc.stream_id_));
HIP_CHECK_ERROR(hipEventSynchronize(stop_));
HIP_CHECK_ERROR(hipGetLastError());
HIP_CHECK_ERROR(hipGraphDestroy(graph_));
float total_time = 0;
HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start_, stop_));
ms = total_time / repeat;
}
#endif
auto gbps = [&](){
double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
return total_bytes / 1.E6 / ms;
}();
printf( printf(
"[cast] %s->%s, n:%lu, ms:%f, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms); "[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
if(ms < 0) if(ms < 0)
printf("not supported\n"); printf("not supported\n");
fflush(stdout); fflush(stdout);
......
...@@ -11,12 +11,12 @@ struct Cast ...@@ -11,12 +11,12 @@ struct Cast
}; };
} // namespace impl } // namespace impl
#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_) \ #define DISPATCH_E_CAST_(d_type_, s_type_, byte_per_issue_, chunks_, bs_) \
using src_t = s_type_; \ using src_t = s_type_; \
using dst_t = d_type_; \ using dst_t = d_type_; \
using u_fun = typename impl::Cast; \ using u_fun = typename impl::Cast; \
using problem = \ using problem = \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \ ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \ using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \ using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\ \
...@@ -25,7 +25,7 @@ struct Cast ...@@ -25,7 +25,7 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \ constexpr dim3 blocks = kernel::BlockSize(); \
\ \
float ave_time = ck_tile::launch_kernel( \ float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \ s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time; return ave_time;
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s) float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
...@@ -35,11 +35,51 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf ...@@ -35,11 +35,51 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
{ {
if(t.output_type == "fp32" && t.input_type == "fp16") if(t.output_type == "fp32" && t.input_type == "fp16")
{ {
DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, 8*sizeof(ck_tile::fp16_t), 8) constexpr int eb = sizeof(ck_tile::fp16_t);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
}
else {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
}
}
else {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
}
} }
else if(t.output_type == "fp16" && t.input_type == "fp32") else if(t.output_type == "fp16" && t.input_type == "fp32")
{ {
DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, 4*sizeof(float), 8) constexpr int eb = sizeof(float);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
}
else {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
}
}
else {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
}
} }
} }
return rtn; return rtn;
......
...@@ -13,6 +13,7 @@ struct elementwise_trait ...@@ -13,6 +13,7 @@ struct elementwise_trait
std::string acc_type; // type to do intermediate computation std::string acc_type; // type to do intermediate computation
std::string output_type; // type to store out std::string output_type; // type to store out
std::string op; std::string op;
int num_cu;
}; };
struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs
......
#!/bin/sh
EXE=./build/bin/tile_example_elementwise
$EXE -pr_i=fp16 -pr_o=fp32 -n=2043904
$EXE -pr_i=fp16 -pr_o=fp32 -n=992256
$EXE -pr_i=fp16 -pr_o=fp32 -n=846304
$EXE -pr_i=fp16 -pr_o=fp32 -n=434176
$EXE -pr_i=fp16 -pr_o=fp32 -n=159424
$EXE -pr_i=fp16 -pr_o=fp32 -n=98304
$EXE -pr_i=fp16 -pr_o=fp32 -n=73728
$EXE -pr_i=fp16 -pr_o=fp32 -n=17408
$EXE -pr_i=fp16 -pr_o=fp32 -n=512
$EXE -pr_i=fp16 -pr_o=fp32 -n=256
echo "-------------------------------------"
$EXE -pr_i=fp32 -pr_o=fp16 -n=2043904
$EXE -pr_i=fp32 -pr_o=fp16 -n=992256
$EXE -pr_i=fp32 -pr_o=fp16 -n=846304
$EXE -pr_i=fp32 -pr_o=fp16 -n=434176
$EXE -pr_i=fp32 -pr_o=fp16 -n=159424
$EXE -pr_i=fp32 -pr_o=fp16 -n=98304
$EXE -pr_i=fp32 -pr_o=fp16 -n=73728
$EXE -pr_i=fp32 -pr_o=fp16 -n=17408
$EXE -pr_i=fp32 -pr_o=fp16 -n=512
$EXE -pr_i=fp32 -pr_o=fp16 -n=256
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment