Commit f9c478e2 authored by ltqin

Merge branch 'develop' into bmatrix_skip_lds

parents 7d85d04a 91d8b7d6
...@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: alpha\n"); printf("arg14: alpha\n");
printf("arg15: beta\n"); printf("arg15: beta\n");
...@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n"); printf("arg14: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
printf("arg15: split k into mulitiple batch\n"); printf("arg15: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n"); printf("arg14: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n"); "64,64 64,64 128,128)\n");
exit(1); exit(1);
...@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]); const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]); const auto Ns = argToIntArray(argv[9]);
...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
} }
return 1; return 0;
} }
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib> #include <cstdlib>
#include <vector> #include <vector>
#include <stdexcept> #include <stdexcept>
#include <sstream> #include <sstream>
#include <getopt.h> #include <getopt.h>
#include "config.hpp" #include "data_type_enum.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp" #include "profile_reduce_impl.hpp"
using namespace std; using namespace std;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp; using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, ...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"bf16", no_argument, nullptr, '?'}, {"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'}, {"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'}, {"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
{"help", no_argument, nullptr, '?'}, {"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}}; {nullptr, 0, nullptr, 0}};
template <typename T>
static T getSingleValueFromString(const string& valueStr)
{
std::istringstream iss(valueStr);
T val;
iss >> val;
return (val);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
}
enum struct AppDataType
{
appHalf = 0,
appFloat = 1,
appInt32 = 2,
appInt8 = 3,
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims) static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{ {
for(auto dim : reduceDims) for(auto dim : reduceDims)
...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims ...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
}; };
}; };
class AppArgs class ReduceProfilerArgs
{ {
private: private:
int option_index = 0; int option_index = 0;
...@@ -130,26 +68,23 @@ class AppArgs ...@@ -130,26 +68,23 @@ class AppArgs
std::vector<float> scales; std::vector<float> scales;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD; ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat; ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
AppDataType outTypeId = AppDataType::appFloat; ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
bool compType_assigned = false; bool compType_assigned = false;
bool outType_assigned = false; bool outType_assigned = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; int nanOpt = 0;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; int indicesOpt = 0;
bool do_log = false; bool do_verification = false;
bool do_verification = false; bool do_dumpout = false;
bool do_dumpout = false;
int init_method; int init_method;
int nrepeat; bool time_kernel;
bool need_indices = false; ReduceProfilerArgs() = default;
~ReduceProfilerArgs() = default;
AppArgs() = default;
~AppArgs() = default;
void show_usage(const char* cmd) void show_usage(const char* cmd)
{ {
...@@ -166,8 +101,11 @@ class AppArgs ...@@ -166,8 +101,11 @@ class AppArgs
std::cout << "--outType or -W, optional enum value indicating the type of the reduced " std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half" "output, which could be float when the input data is half"
<< std::endl; << std::endl;
std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl; std::cout
std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt" << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<< std::endl;
std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<< std::endl; << std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta" std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl; << std::endl;
...@@ -181,18 +119,19 @@ class AppArgs ...@@ -181,18 +119,19 @@ class AppArgs
std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files " std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis" "for further analysis"
<< std::endl; << std::endl;
std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
}; };
int processArgs(int argc, char* argv[]) int processArgs(int argc, char* argv[])
{ {
unsigned int ch; using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the "reduce" module name optind++; // to skip the "reduce" module name
while(1) while(1)
{ {
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index); ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
if(ch == -1) if(ch == -1)
break; break;
switch(ch) switch(ch)
...@@ -219,27 +158,27 @@ class AppArgs ...@@ -219,27 +158,27 @@ class AppArgs
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<AppDataType>(std::atoi(optarg)); compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
compType_assigned = true; compType_assigned = true;
break; break;
case 'W': case 'W':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<AppDataType>(std::atoi(optarg)); outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
outType_assigned = true; outType_assigned = true;
break; break;
case 'N': case 'N':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation>(std::atoi(optarg)); nanOpt = std::atoi(optarg);
break; break;
case 'I': case 'I':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg)); indicesOpt = std::atoi(optarg);
break; break;
case 'S': case 'S':
if(!optarg) if(!optarg)
...@@ -262,12 +201,6 @@ class AppArgs ...@@ -262,12 +201,6 @@ class AppArgs
do_dumpout = static_cast<bool>(std::atoi(optarg)); do_dumpout = static_cast<bool>(std::atoi(optarg));
break; break;
case 'l':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_log = static_cast<bool>(std::atoi(optarg));
break;
case '?': case '?':
if(std::string(long_options[option_index].name) == "half") if(std::string(long_options[option_index].name) == "half")
use_half = true; use_half = true;
...@@ -295,7 +228,7 @@ class AppArgs ...@@ -295,7 +228,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind++]);
nrepeat = std::atoi(argv[optind]); time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty()) if(scales.empty())
{ {
...@@ -306,9 +239,6 @@ class AppArgs ...@@ -306,9 +239,6 @@ class AppArgs
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX || if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX) reduceOp == ReduceTensorOp::AMAX)
{ {
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be // for indexable operations, no need to assign compType and outType, just let them be
// same as inType // same as inType
compType_assigned = false; compType_assigned = false;
...@@ -322,9 +252,10 @@ class AppArgs ...@@ -322,9 +252,10 @@ class AppArgs
int profile_reduce(int argc, char* argv[]) int profile_reduce(int argc, char* argv[])
{ {
using namespace ck::profiler; using ck::DataTypeEnum;
using ck::profiler::profile_reduce_impl;
AppArgs args; ReduceProfilerArgs args;
if(args.processArgs(argc, argv) < 0) if(args.processArgs(argc, argv) < 0)
return (-1); return (-1);
...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[]) ...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half) if(args.use_half)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appHalf; args.compTypeId = DataTypeEnum::Half;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appHalf; args.outTypeId = DataTypeEnum::Half;
if(args.compTypeId == AppDataType::appHalf) if(args.compTypeId == DataTypeEnum::Half)
{ {
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
args.init_method, args.do_verification,
args.do_log, args.init_method,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appFloat) else if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[]) ...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{ {
profile_reduce_impl<double, double, double>(args.do_verification, profile_reduce_impl<double, double, double>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.use_int8) else if(args.use_int8)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8; args.compTypeId = DataTypeEnum::Int8;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
args.outTypeId = AppDataType::appInt32; args.outTypeId = DataTypeEnum::Int32;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8; args.outTypeId = DataTypeEnum::Int8;
if(args.compTypeId == AppDataType::appInt8) if(args.compTypeId == DataTypeEnum::Int8)
{ {
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appInt32) else if(args.compTypeId == DataTypeEnum::Int32)
{ {
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[]) ...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else if(args.use_bf16) else if(args.use_bf16)
{ {
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16; args.outTypeId = DataTypeEnum::BFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification, profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else else
{ {
if(args.compTypeId == AppDataType::appFloat) if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<float, float, float>(args.do_verification, profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appDouble) else if(args.compTypeId == DataTypeEnum::Double)
{ {
profile_reduce_impl<float, double, float>(args.do_verification, profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
......
...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]); ...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]); int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]); int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]); int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
...@@ -53,7 +54,7 @@ int main(int argc, char* argv[]) ...@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
} }
else if(strcmp(argv[1], "grouped_gemm") == 0) else if(strcmp(argv[1], "grouped_gemm") == 0)
{ {
profile_grouped_gemm(argc, argv); return profile_grouped_gemm(argc, argv);
} }
else if(strcmp(argv[1], "conv_fwd") == 0) else if(strcmp(argv[1], "conv_fwd") == 0)
{ {
...@@ -107,7 +108,7 @@ int main(int argc, char* argv[]) ...@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim\n" " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n" " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n" " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n" " reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n"); " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on // clang-format on
} }
......
#!/usr/bin/env python3
import os, io, sys, argparse, datetime
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
import pymysql
import pandas as pd
from sshtunnel import SSHTunnelForwarder
def print_to_string(*args, **kwargs):
output = io.StringIO()
print(*args, file=output, **kwargs)
contents = output.getvalue()
output.close()
return contents
def parse_args():
parser = argparse.ArgumentParser(description='Parse results from ckProfiler benchmark runs')
parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
args = parser.parse_args()
files = []
if os.path.isdir(args.filename):
all_files = os.listdir(args.filename)
for name in all_files:
if 'log' not in name:
continue
files.append(os.path.join(args.filename, name))
else:
files = [args.filename]
args.files = files
return args
def main():
args = parse_args()
tests = []
kernels=[]
tflops=[]
dtype=[]
alayout=[]
blayout=[]
M=[]
N=[]
K=[]
StrideA=[]
StrideB=[]
StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels
glue=""
for filename in args.files:
for line in open(filename):
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
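#field positions in a whitespace-split "Best Perf" line, as used below: 5=dtype, 8=ALayout, 11=BLayout, 14=M, 17=N, 20=K, 23=StrideA, 26=StrideB, 29=StrideC, 33=TFlops, 37+=kernel name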
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
print("Number of tests:",len(tests))
print("Branch name:",branch_name)
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
print("sql_username=",sql_username)
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
print("ssh_host=",ssh_host)
ssh_user = os.environ["dbsshuser"]
print("ssh_user=",ssh_user)
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#write the ck_gemm_test_params table
#only needed once the test set changes
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops results:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline
regression=0
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
    sys.exit(main())
\ No newline at end of file
#!/bin/bash #!/bin/bash
## GPU visibility ## GPU visibility
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
#make -j ckProfiler
make -j ckProfiler DRIVER="../build/bin/ckProfiler"
echo $DRIVER
DRIVER="./profiler/ckProfiler"
OP=$1 OP=$1
DATATYPE=$2 DATATYPE=$2
LAYOUT=$3 LAYOUT=$3
...@@ -43,3 +41,13 @@ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1 ...@@ -43,3 +41,13 @@ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 6656 8192 8192 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3328 4096 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1664 2048 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 832 1024 1024 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7040 8192 8192 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 5120 5632 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2560 2816 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1280 1408 1024 -1 -1 -1
...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
......
...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
......
include_directories(BEFORE include_directories(BEFORE
${PROJECT_SOURCE_DIR}/
${PROJECT_SOURCE_DIR}/include/ck ${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility ${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor ${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
...@@ -21,7 +23,8 @@ include_directories(BEFORE ...@@ -21,7 +23,8 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/external/include/half ${PROJECT_SOURCE_DIR}/external/include/half
) )
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) include(googletest)
add_custom_target(tests) add_custom_target(tests)
...@@ -41,7 +44,7 @@ function(add_gtest_executable TEST_NAME) ...@@ -41,7 +44,7 @@ function(add_gtest_executable TEST_NAME)
add_dependencies(tests ${TEST_NAME}) add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME}) add_dependencies(check ${TEST_NAME})
# suppress gtest warnings # suppress gtest warnings
target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors) target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(${TEST_NAME} PRIVATE gtest_main) target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
gtest_discover_tests(${TEST_NAME}) gtest_discover_tests(${TEST_NAME})
endfunction(add_gtest_executable TEST_NAME) endfunction(add_gtest_executable TEST_NAME)
...@@ -60,3 +63,6 @@ add_subdirectory(grouped_gemm) ...@@ -60,3 +63,6 @@ add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd) add_subdirectory(convnd_fwd)
add_subdirectory(reduce) add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight) add_subdirectory(conv2d_bwd_weight)
add_subdirectory(convnd_bwd_data)
add_subdirectory(block_to_ctile_map)
# DO NOT add client_app; it is tested via CI independently
...@@ -22,7 +22,7 @@ int main() ...@@ -22,7 +22,7 @@ int main()
Row, Row,
Row, Row,
Row>( Row>(
true, 1, false, 1, M, N, K, K, N, N, BatchCount); true, 1, false, false, M, N, K, K, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -31,7 +31,7 @@ int main() ...@@ -31,7 +31,7 @@ int main()
Row, Row,
Col, Col,
Row>( Row>(
true, 1, false, 1, M, N, K, K, K, N, BatchCount); true, 1, false, false, M, N, K, K, K, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -40,7 +40,7 @@ int main() ...@@ -40,7 +40,7 @@ int main()
Col, Col,
Row, Row,
Row>( Row>(
true, 1, false, 1, M, N, K, M, N, N, BatchCount); true, 1, false, false, M, N, K, M, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -49,7 +49,7 @@ int main() ...@@ -49,7 +49,7 @@ int main()
Col, Col,
Col, Col,
Row>( Row>(
true, 1, false, 1, M, N, K, M, K, N, BatchCount); true, 1, false, false, M, N, K, M, K, N, BatchCount);
if(pass) if(pass)
{ {
......
add_gtest_executable(test_block_to_ctile_map test_block_to_ctile_map.cpp)
\ No newline at end of file
#include <ck/config.hpp>
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "gtest/gtest.h"
#include <iostream>
#include <vector>
using namespace ck;
static auto I0 = Number<0>{};
static auto I1 = Number<1>{};
static auto I2 = Number<2>{};
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1)
{
const index_t M = 384;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
const index_t M01 = 4;
const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01,
N01);
BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
c_grid_desc_m_n, M01, N01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off
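// each row: {m0 block index, n0 block index, expected ValidCTileIndex result}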
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{0, 1, 1},
{0, 2, 1},
{0, 3, 0},
{1, 0, 1},
{1, 1, 1},
{1, 2, 1},
{1, 3, 0},
{2, 0, 1},
{2, 1, 1},
{2, 2, 1},
{2, 3, 0},
{3, 0, 0},
{3, 1, 0},
{3, 2, 0},
{3, 3, 0}
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0)
{
const index_t M = 384;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t M01 = 4;
const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01,
N01);
BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false>
tile_map(c_grid_desc_m_n, M01, N01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false);
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck1)
{
const index_t M = 384;
const index_t N = 512;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
const index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 0},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 0},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 0},
{0, 3, 1},
{1, 3, 1},
{2, 3, 1},
{3, 3, 0}
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck0)
{
const index_t M = 512;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
// clang-format off
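// each row: {M01, expected grid size, expected CheckValidity result}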
std::vector<std::tuple<int, int, bool>> expected_m0_gridsize_validity = {
{5, 15, false},
{4, 12, true},
{3, 18, false},
{2, 12, true},
{1, 12, true}
};
// clang-format on
for(auto e : expected_m0_gridsize_validity)
{
const index_t M01 = std::get<0>(e);
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false> tile_map(
c_grid_desc_m_n, M01);
EXPECT_EQ(tile_map.CalculateGridSize(c_grid_desc_m_n), std::get<1>(e));
EXPECT_EQ(tile_map.CheckValidity(c_grid_desc_m_n), std::get<2>(e));
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 1},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 1},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 1},
{4, 0, 1},
{5, 0, 1},
{4, 1, 1},
{5, 1, 1},
{4, 2, 1},
{5, 2, 1},
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_KSplit_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
const index_t KSplit = 3;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)>
tile_map(c_grid_desc_m_n, M01, KSplit);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18 * KSplit);
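// each row: {ksplit index, m0 block index, n0 block index, expected ValidCTileIndex result}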
std::vector<std::vector<int>> expected_ksplitidx_m0idx_n0idx_valid = {
{0, 0, 0, 1}, {0, 1, 0, 1}, {0, 2, 0, 1}, {0, 3, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1},
{0, 2, 1, 1}, {0, 3, 1, 1}, {0, 0, 2, 1}, {0, 1, 2, 1}, {0, 2, 2, 1}, {0, 3, 2, 1},
{0, 4, 0, 1}, {0, 5, 0, 1}, {0, 4, 1, 1}, {0, 5, 1, 1}, {0, 4, 2, 1}, {0, 5, 2, 1},
{1, 0, 0, 1}, {1, 1, 0, 1}, {1, 2, 0, 1}, {1, 3, 0, 1}, {1, 0, 1, 1}, {1, 1, 1, 1},
{1, 2, 1, 1}, {1, 3, 1, 1}, {1, 0, 2, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}, {1, 3, 2, 1},
{1, 4, 0, 1}, {1, 5, 0, 1}, {1, 4, 1, 1}, {1, 5, 1, 1}, {1, 4, 2, 1}, {1, 5, 2, 1},
{2, 0, 0, 1}, {2, 1, 0, 1}, {2, 2, 0, 1}, {2, 3, 0, 1}, {2, 0, 1, 1}, {2, 1, 1, 1},
{2, 2, 1, 1}, {2, 3, 1, 1}, {2, 0, 2, 1}, {2, 1, 2, 1}, {2, 2, 2, 1}, {2, 3, 2, 1},
{2, 4, 0, 1}, {2, 5, 0, 1}, {2, 4, 1, 1}, {2, 5, 1, 1}, {2, 4, 2, 1}, {2, 5, 2, 1},
};
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto ksplitm0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", ksplit, m0, n0 = " << ksplitm0n0_idx[I0] << ", "
<< ksplitm0n0_idx[I1] << ", " << ksplitm0n0_idx[I2];
std::cout << ", valid = "
<< tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_ksplitidx_m0idx_n0idx_valid[i] ==
std::vector<int>{ksplitm0n0_idx[I0],
ksplitm0n0_idx[I1],
ksplitm0n0_idx[I2],
tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++14)
find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
add_executable(test_client_app client_app.cpp)
target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
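# A minimal out-of-tree configure/build/run sequence might look like the following;
# the install prefixes and the conv_fwd arguments are purely illustrative:
#   cmake -D CMAKE_PREFIX_PATH="/opt/rocm;/opt/composable_kernel" <path-to-this-dir>
#   make test_client_app
#   ./test_client_app conv_fwd 1 1 1 1 0 1 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1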
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include <vector>
#include "client_app_impl.hpp"
int main(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
// Layout arguments are parsed for completeness but not forwarded below;
// profile_conv_fwd_impl only registers NHWC/KYXC/NHWK instances.
const ConvDataType data_type      = static_cast<ConvDataType>(std::stoi(argv[2]));
const ConvInputLayout in_layout   = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const ConvWeightLayout wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const ConvOutputLayout out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
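// Standard convolution output-size arithmetic: the dilated filter extent is (Y - 1) * Dy + 1,
// and Ho = (Hi + LeftPy + RightPy - YEff) / Sy + 1 (likewise for the W dimension).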
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
ck::app::profile_conv_fwd_impl(do_verification,
init_method,
do_log,
time_kernel,
data_type,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
return 0;
}
#pragma once
#include "host_interface.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
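// Host-side HIP helpers for the standalone client: last-error checking, device name, and
// driver version queries.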
void check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
exit(err);
}
}
std::string getDeviceName(int device)
{
struct hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
check_hip_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
hipDriverGetVersion(&driver);
check_hip_error();
return driver;
}
namespace ck {
namespace app {
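// Minimal RAII wrapper around a HIP device allocation used by this client app.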
struct DeviceMem
{
DeviceMem() = delete;
DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer();
void ToDevice(const void* p);
void FromDevice(void* p);
~DeviceMem();
void* mpDeviceBuf;
std::size_t mMemSize;
};
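// Out-of-line member definitions. HIP return codes are passed to hipGetErrorString and otherwise
// ignored, so allocation/copy failures are not propagated to the caller.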
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
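// Enumerate the device Conv instances registered for the requested data type, run every instance
// that supports the given problem, and report the fastest one.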
void profile_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
ConvDataType data_type,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
const auto in_sz = N * C * Hi * Wi;
const auto wei_sz = K * C * Y * X;
const auto out_sz = N * K * Ho * Wo;
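// Note: buffers below are sized for fp32 elements regardless of data_type, which over-allocates
// for f16/bf16/int8 but is large enough for every supported type.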
using WeiDataType = float;
using InDataType = float;
using OutDataType = float;
app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
// data is already on device!
// add device Conv instances
std::vector<DeviceConvFwdPtr_t> conv_ptrs;
if(data_type == F16_F16_F16)
{
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
}
else if(data_type == BF16_BF16_BF16)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
}
else if(data_type == F32_F32_F32)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
}
else if(data_type == INT8_INT8_INT8)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
}
else
{
throw std::runtime_error("wrong! Invalid data type");
}
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int deviceIndex = 0;
hipSetDevice(deviceIndex);
check_hip_error();
StreamConfig stream_config{nullptr, time_kernel};
hipStreamCreate(&stream_config.stream_id_);
check_hip_error();
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr =
conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
static_cast<void*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = conv_ptr.MakeInvokerPointer();
if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr.GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
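// ave_time is in milliseconds, so GFLOP / ms = TFLOP/s and MB / ms = GB/s.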
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace app
} // namespace ck
...@@ -4,4 +4,4 @@ include_directories(BEFORE ...@@ -4,4 +4,4 @@ include_directories(BEFORE
) )
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util) target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <half.hpp> #include <half.hpp>
#include <vector> #include <vector>
#include "conv_fwd_util.hpp" #include "conv_util.hpp"
#include "profile_conv_bwd_weight_impl.hpp" #include "profile_conv_bwd_weight_impl.hpp"
int test_self() int test_self()
...@@ -28,20 +28,20 @@ int test_self() ...@@ -28,20 +28,20 @@ int test_self()
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, // do_verification, true, // do_verification
1, // init_method, 1, // init_method
0, // do_log, false, // do_log
1, // nrepeat, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
2); 2);
// fp16 // fp16
...@@ -52,28 +52,28 @@ int test_self() ...@@ -52,28 +52,28 @@ int test_self()
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, // do_verification, true, // do_verification
1, // init_method, 1, // init_method
0, // do_log, false, // do_log
1, // nrepeat, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
2); 2);
} }
return pass; return pass;
} }
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
int data_type = 0; int data_type = 1;
int init_method = 0; int init_method = 1;
// Conv shape // Conv shape
ck::index_t N = 128; ck::index_t N = 128;
...@@ -155,20 +155,20 @@ int main(int argc, char* argv[]) ...@@ -155,20 +155,20 @@ int main(int argc, char* argv[])
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, true, // do_verification
init_method, init_method,
0, false, // do_log
1, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
split_k); split_k);
} }
else if(data_type == 1) else if(data_type == 1)
...@@ -180,20 +180,20 @@ int main(int argc, char* argv[]) ...@@ -180,20 +180,20 @@ int main(int argc, char* argv[])
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, true, // do_verification
init_method, init_method,
0, false, // do_log
1, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
split_k); split_k);
} }
else else
......