Unverified Commit cc89ee59 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Revise Code - Add hipblasLt tuning to dist-inference cpp implementation (#616)

**Description**
Adds hipblasLt tuning to dist-inference cpp implementation.
parent eeaa9b1a
...@@ -312,6 +312,12 @@ def add_parser_arguments(self): ...@@ -312,6 +312,12 @@ def add_parser_arguments(self):
required=False, required=False,
help='Whether to launch kernels in CUDA graph mode.', help='Whether to launch kernels in CUDA graph mode.',
) )
self._parser.add_argument(
'--tune_gemm',
action='store_true',
required=False,
help='Whether to tune GEMM performance before testing.',
)
def _preprocess(self): def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking. """Preprocess/preparation operations before the benchmarking.
...@@ -356,6 +362,8 @@ def _preprocess(self): ...@@ -356,6 +362,8 @@ def _preprocess(self):
(self._args.num_layers, self._args.num_warmup, self._args.num_steps) (self._args.num_layers, self._args.num_warmup, self._args.num_steps)
if self._args.use_cuda_graph: if self._args.use_cuda_graph:
args += ' --use_cuda_graph' args += ' --use_cuda_graph'
if self._args.tune_gemm:
args += ' --tune_gemm'
self._commands = ['%s %s' % (self.__bin_path, args)] self._commands = ['%s %s' % (self.__bin_path, args)]
return True return True
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
* *
*******************************************************************************/ *******************************************************************************/
#include <algorithm>
#include <chrono> #include <chrono>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
...@@ -60,6 +61,21 @@ using cublasLtHalf = hipblasLtHalf; ...@@ -60,6 +61,21 @@ using cublasLtHalf = hipblasLtHalf;
#else #else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32 #define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif #endif
#if HIP_VERSION >= 50700000
#include <hipblaslt/hipblaslt-ext.hpp>
#if HIP_VERSION >= 60000000
#define HIPBLASLT_GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo)
#else
// Extract the solver index stored at the front of a hipblasLt algorithm handle.
// Pre-HIP-6.0 fallback for hipblaslt_ext::getIndexFromAlgo; returns -1 when the
// stored index is negative (no valid algorithm selected).
static int getIndexFromAlgo(hipblasLtMatmulAlgo_t &algo) {
    const int index = *reinterpret_cast<int *>(algo.data);
    return (index < 0) ? -1 : index;
}
#define HIPBLASLT_GETINDEXFROMALGO(algo) getIndexFromAlgo(algo)
#endif
#endif
#else #else
#include <cublasLt.h> #include <cublasLt.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -94,7 +110,8 @@ using cublasLtHalf = half; ...@@ -94,7 +110,8 @@ using cublasLtHalf = half;
#endif #endif
static void ShowUsage(char *argv[]) { static void ShowUsage(char *argv[]) {
std::cerr << "Usage: " << argv[0] << " <options>\n" std::cerr
<< "Usage: " << argv[0] << " <options>\n"
<< "options:\n" << "options:\n"
<< "\t-h, --help\t\t\t\tShow this help message\n" << "\t-h, --help\t\t\t\tShow this help message\n"
<< "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n" << "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n"
...@@ -106,11 +123,13 @@ static void ShowUsage(char *argv[]) { ...@@ -106,11 +123,13 @@ static void ShowUsage(char *argv[]) {
<< "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n" << "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n"
<< "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n" << "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n"
<< "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n" << "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n"
<< "\t--tune_gemm \t\t\ttune_gemm \t\tWhether to tune GEMM before testing. Currently only work for hipblasLt.\n"
<< std::endl; << std::endl;
} }
static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_t *k, float *alpha, float *beta, static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_t *k, float *alpha, float *beta,
int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph) { int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph,
bool *tune_gemm) {
if (argc >= 2) { if (argc >= 2) {
for (int i = 1; i < argc; ++i) { for (int i = 1; i < argc; ++i) {
std::string arg = argv[i]; std::string arg = argv[i];
...@@ -143,6 +162,8 @@ static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_ ...@@ -143,6 +162,8 @@ static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_
std::cerr << "not supported by current environment" << std::endl << std::endl; std::cerr << "not supported by current environment" << std::endl << std::endl;
return -1; return -1;
#endif #endif
} else if (arg == "--tune_gemm") {
*tune_gemm = true;
} else { } else {
std::cerr << "error with " << arg << std::endl; std::cerr << "error with " << arg << std::endl;
std::cerr << "do not recognize option" << std::endl << std::endl; std::cerr << "do not recognize option" << std::endl << std::endl;
...@@ -182,10 +203,91 @@ void InitializeABCDEF(std::vector<cublasLtHalf> &ha, int64_t size_a, std::vector ...@@ -182,10 +203,91 @@ void InitializeABCDEF(std::vector<cublasLtHalf> &ha, int64_t size_a, std::vector
} }
} }
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 50700000
// Tune GEMM algorithm in local rank.
// Write <0 to ret_algo_time_in_ms if nothing found.
// Write >=0 to ret_algo_time_in_ms and write ret_algo if something is found.
// Benchmark every supported hipblasLt algorithm for the given matmul problem on
// this rank and report the fastest one found.
//
// The full candidate list is partitioned round-robin across ranks (this rank
// tests indices rank, rank+num_ranks, ...), so each rank measures only a slice;
// results are meant to be merged afterwards (see TuneHipblasLtGemmGlobal).
//
// Outputs:
//   *ret_algo_time_in_ms — average per-call time in ms of the best algorithm
//                          tested by this rank, or < 0 if none was usable.
//   *ret_algo            — written only when *ret_algo_time_in_ms >= 0.
void TuneHipblasLtGemmLocal(const hipblasLtHandle_t &handle, const hipblasLtMatmulDesc_t &matmul, float alpha, void *da,
                            const hipblasLtMatrixLayout_t &matA, void *db, const hipblasLtMatrixLayout_t &matB,
                            float beta, void *dc, const hipblasLtMatrixLayout_t &matC, void *dd,
                            const hipblasLtMatrixLayout_t &matD, void *d_workspace, uint64_t workspace_size,
                            const cudaStream_t &stream, int rank, int num_ranks, hipblasLtMatmulAlgo_t *ret_algo,
                            float *ret_algo_time_in_ms) {
    std::vector<hipblasLtMatmulHeuristicResult_t> gemm_heuristics;
    // Get all possible algorithms
    CHECK_CUBLASLT_ERROR(hipblaslt_ext::getAllAlgos(
        handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, HIPBLAS_OP_N, HIPBLAS_OP_N, DIST_INF_HIP_DATATYPE_R_16F,
        DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F,
        DIST_INF_HIP_COMPUTETYPE_F32, gemm_heuristics));
    // Make sure the algorithm order is deterministic: getAllAlgos gives no
    // ordering guarantee, and all ranks must slice an identically-ordered list
    // for the round-robin partition below to cover each algorithm exactly once.
    std::sort(gemm_heuristics.begin(), gemm_heuristics.end(),
              [](hipblasLtMatmulHeuristicResult_t &a, hipblasLtMatmulHeuristicResult_t &b) {
                  return HIPBLASLT_GETINDEXFROMALGO(a.algo) < HIPBLASLT_GETINDEXFROMALGO(b.algo);
              });
    // Timing utilities
    cudaEvent_t start_event;
    cudaEvent_t end_event;
    const int kNumWarmups = 10;
    const int kNumTestRuns = 100;
    // Sentinel: negative means "no usable algorithm found yet".
    *ret_algo_time_in_ms = -1;
    // Benchmark all algorithms in given shape
    CHECK_CUDA_ERROR(cudaEventCreate(&start_event));
    CHECK_CUDA_ERROR(cudaEventCreate(&end_event));
    // Partition work evenly into different ranks
    for (size_t algo_idx = rank; algo_idx < gemm_heuristics.size(); algo_idx += num_ranks) {
        auto &algo = gemm_heuristics[algo_idx].algo;
        size_t ret_workspace_size = 0;
        // Skip algorithms that reject this problem shape or need more workspace
        // than we allocated.
        auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, &alpha, matA, matB, &beta, matC, matD, algo,
                                                           ret_workspace_size);
        if (status != HIPBLAS_STATUS_SUCCESS || ret_workspace_size >= workspace_size) {
            continue;
        }
        // Warm up so one-time setup cost does not pollute the measurement.
        for (int i = 0; i < kNumWarmups; i++) {
            CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
                                                 &algo, d_workspace, workspace_size, stream));
        }
        // Time kNumTestRuns back-to-back launches with device events; the
        // stream sync below ensures end_event has completed before reading it.
        CHECK_CUDA_ERROR(cudaEventRecord(start_event, stream));
        for (int i = 0; i < kNumTestRuns; i++) {
            CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
                                                 &algo, d_workspace, workspace_size, stream));
        }
        CHECK_CUDA_ERROR(cudaEventRecord(end_event, stream));
        CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
        float time_in_ms = 0;
        CHECK_CUDA_ERROR(cudaEventElapsedTime(&time_in_ms, start_event, end_event));
        time_in_ms /= kNumTestRuns;
        // Keep the fastest algorithm seen so far.
        if (*ret_algo_time_in_ms < 0 || time_in_ms < *ret_algo_time_in_ms) {
            *ret_algo = algo;
            *ret_algo_time_in_ms = time_in_ms;
        }
    }
    CHECK_CUDA_ERROR(cudaEventDestroy(start_event));
    CHECK_CUDA_ERROR(cudaEventDestroy(end_event));
}
// Select global best GEMM algorithms across ranks. Write global_algo if something is found.
void TuneHipblasLtGemmGlobal(int num_ranks, const hipblasLtMatmulAlgo_t &local_algo, float local_time_in_ms,
hipblasLtMatmulAlgo_t *global_algo) {
std::vector<hipblasLtMatmulAlgo_t> coll_algo(num_ranks);
std::vector<float> coll_time_in_ms(num_ranks);
MPI_Allgather(&local_algo, sizeof(local_algo), MPI_BYTE, coll_algo.data(), sizeof(local_algo), MPI_BYTE,
MPI_COMM_WORLD);
MPI_Allgather(&local_time_in_ms, sizeof(local_time_in_ms), MPI_BYTE, coll_time_in_ms.data(),
sizeof(local_time_in_ms), MPI_BYTE, MPI_COMM_WORLD);
float min_time_in_ms = -1;
for (int i = 0; i < num_ranks; i++) {
if (coll_time_in_ms[i] >= 0 && (min_time_in_ms < 0 || coll_time_in_ms[i] < min_time_in_ms)) {
min_time_in_ms = coll_time_in_ms[i];
*global_algo = coll_algo[i];
}
}
}
#endif
// B[m, k] * A[k, n] + C[m, n] = D[m, n] // B[m, k] * A[k, n] + C[m, n] = D[m, n]
// E[k, m] * D[m, n] + F[k, n] = G[k, n] // E[k, m] * D[m, n] + F[k, n] = G[k, n]
void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t num_layers, int32_t num_warmups, void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t num_layers, int32_t num_warmups,
int32_t num_iters, bool use_cuda_graph, ncclComm_t nccl_comm) { int32_t num_iters, bool use_cuda_graph, bool tune_gemm, ncclComm_t nccl_comm, int rank, int num_ranks) {
const int kNcclBufAlignment = 512; const int kNcclBufAlignment = 512;
int size_a = k * n; int size_a = k * n;
...@@ -230,7 +332,11 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -230,7 +332,11 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUDA_ERROR(cudaMemcpy(de, he.data(), sizeof(cublasLtHalf) * size_e, cudaMemcpyHostToDevice)); CHECK_CUDA_ERROR(cudaMemcpy(de, he.data(), sizeof(cublasLtHalf) * size_e, cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(df, hf.data(), sizeof(cublasLtHalf) * size_f, cudaMemcpyHostToDevice)); CHECK_CUDA_ERROR(cudaMemcpy(df, hf.data(), sizeof(cublasLtHalf) * size_f, cudaMemcpyHostToDevice));
#if defined(__HIP_PLATFORM_AMD__)
uint64_t workspace_size = 256 * 1024 * 1024; // max workspace size allowed for hipblaslt
#else
uint64_t workspace_size = 1024 * 1024; uint64_t workspace_size = 1024 * 1024;
#endif
void *d_workspace; void *d_workspace;
CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size)); CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size));
int returnedAlgoCount = 0; int returnedAlgoCount = 0;
...@@ -279,8 +385,22 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -279,8 +385,22 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
// E[k, m] * D[m, n] + F[k, n] = G[k, n] // E[k, m] * D[m, n] + F[k, n] = G[k, n]
CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul1, matB, matA, matC, matD, pref, 1, CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul1, matB, matA, matC, matD, pref, 1,
heuristicResult1, &returnedAlgoCount)); heuristicResult1, &returnedAlgoCount));
hipblasLtMatmulAlgo_t algo1 = heuristicResult1[0].algo;
CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul2, matE, matD, matF, matG, pref, 1, CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul2, matE, matD, matF, matG, pref, 1,
heuristicResult2, &returnedAlgoCount)); heuristicResult2, &returnedAlgoCount));
hipblasLtMatmulAlgo_t algo2 = heuristicResult2[0].algo;
#if HIP_VERSION >= 50700000
if (tune_gemm) {
hipblasLtMatmulAlgo_t ret_algo;
float ret_algo_time_in_ms;
TuneHipblasLtGemmLocal(handle, matmul1, alpha, db, matB, da, matA, beta, dc, matC, dd, matD, d_workspace,
workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo1);
TuneHipblasLtGemmLocal(handle, matmul2, alpha, de, matE, dd, matD, beta, df, matF, dg, matG, d_workspace,
workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo2);
}
#endif
#else #else
cublasLtHandle_t handle; cublasLtHandle_t handle;
cublasLtMatrixLayout_t matA, matB, matC, matD, matE, matF, matG; cublasLtMatrixLayout_t matA, matB, matC, matD, matE, matF, matG;
...@@ -328,13 +448,13 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t ...@@ -328,13 +448,13 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
// cublasLt is not well supported by ROCm hipify tools, explicitly define ROCm logic instead. // cublasLt is not well supported by ROCm hipify tools, explicitly define ROCm logic instead.
#if defined(__HIP_PLATFORM_AMD__) #if defined(__HIP_PLATFORM_AMD__)
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD, CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
&heuristicResult1[0].algo, d_workspace, workspace_size, stream)); &algo1, d_workspace, workspace_size, stream));
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG, CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
&heuristicResult2[0].algo, d_workspace, workspace_size, stream)); &algo2, d_workspace, workspace_size, stream));
#else #else
CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD, CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
&heuristicResult1[0].algo, d_workspace, workspace_size, stream)); &heuristicResult1[0].algo, d_workspace, workspace_size, stream));
CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG, CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
&heuristicResult2[0].algo, d_workspace, workspace_size, stream)); &heuristicResult2[0].algo, d_workspace, workspace_size, stream));
#endif #endif
CHECK_NCCL_ERROR(ncclAllReduce(dg, dg, size_g, ncclFloat16, ncclSum, nccl_comm, stream)); CHECK_NCCL_ERROR(ncclAllReduce(dg, dg, size_g, ncclFloat16, ncclSum, nccl_comm, stream));
...@@ -456,18 +576,21 @@ int main(int argc, char *argv[]) { ...@@ -456,18 +576,21 @@ int main(int argc, char *argv[]) {
int32_t num_warmups = 20; int32_t num_warmups = 20;
int32_t num_iters = 100; int32_t num_iters = 100;
bool use_cuda_graph = false; bool use_cuda_graph = false;
bool tune_gemm = false;
if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph)) { if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph,
&tune_gemm)) {
ShowUsage(argv); ShowUsage(argv);
return -1; return -1;
} }
fprintf(stdout, fprintf(stdout,
"Parameters: m=%ld, n=%ld, k=%ld, alpha=%f, beta=%f, num_layers=%d, num_warmups=%d, num_iters=%d, " "Parameters: m=%ld, n=%ld, k=%ld, alpha=%f, beta=%f, num_layers=%d, num_warmups=%d, num_iters=%d, "
"use_cuda_graph=%d\n", "use_cuda_graph=%d, tune_gemm=%d\n",
m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph); m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph, (int)tune_gemm);
TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, nccl_comm); TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, tune_gemm, nccl_comm, comm_rank,
comm_size);
CHECK_NCCL_ERROR(ncclCommDestroy(nccl_comm)); CHECK_NCCL_ERROR(ncclCommDestroy(nccl_comm));
......
...@@ -53,6 +53,7 @@ def test_pytorch_dist_inference_normal(): ...@@ -53,6 +53,7 @@ def test_pytorch_dist_inference_normal():
assert (benchmark._args.distributed_impl == DistributedImpl.DDP) assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
assert (benchmark._args.distributed_backend == DistributedBackend.NCCL) assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
assert (benchmark._args.use_cuda_graph is False) assert (benchmark._args.use_cuda_graph is False)
assert (benchmark._args.tune_gemm is False)
# Check results and metrics. # Check results and metrics.
assert (benchmark.run_count == 1) assert (benchmark.run_count == 1)
...@@ -98,6 +99,7 @@ def test_pytorch_dist_inference_fake_distributed(): ...@@ -98,6 +99,7 @@ def test_pytorch_dist_inference_fake_distributed():
assert (benchmark._args.distributed_impl == DistributedImpl.DDP) assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
assert (benchmark._args.distributed_backend == DistributedBackend.NCCL) assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
assert (benchmark._args.use_cuda_graph is False) assert (benchmark._args.use_cuda_graph is False)
assert (benchmark._args.tune_gemm is False)
# Check results and metrics. # Check results and metrics.
assert (benchmark.run_count == 1) assert (benchmark.run_count == 1)
...@@ -136,7 +138,7 @@ def _test_dist_inference_command_generation(self, platform): ...@@ -136,7 +138,7 @@ def _test_dist_inference_command_generation(self, platform):
num_steps = 8 num_steps = 8
wrapper_params_format_str = \ wrapper_params_format_str = \
'--batch_size %d --input_size %d --hidden_size %d ' \ '--batch_size %d --input_size %d --hidden_size %d ' \
'--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph' '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph --tune_gemm'
parameters = wrapper_params_format_str % ( parameters = wrapper_params_format_str % (
batch_size, input_size, hidden_size, alpha, beta, num_layers, num_warmup, num_steps batch_size, input_size, hidden_size, alpha, beta, num_layers, num_warmup, num_steps
) )
...@@ -161,6 +163,7 @@ def _test_dist_inference_command_generation(self, platform): ...@@ -161,6 +163,7 @@ def _test_dist_inference_command_generation(self, platform):
assert (benchmark._args.num_warmup == num_warmup) assert (benchmark._args.num_warmup == num_warmup)
assert (benchmark._args.num_steps == num_steps) assert (benchmark._args.num_steps == num_steps)
assert (benchmark._args.use_cuda_graph is True) assert (benchmark._args.use_cuda_graph is True)
assert (benchmark._args.tune_gemm is True)
# Check command # Check command
assert (1 == len(benchmark._commands)) assert (1 == len(benchmark._commands))
...@@ -168,7 +171,7 @@ def _test_dist_inference_command_generation(self, platform): ...@@ -168,7 +171,7 @@ def _test_dist_inference_command_generation(self, platform):
m, n, k = hidden_size, batch_size, input_size m, n, k = hidden_size, batch_size, input_size
bench_params_format_str = \ bench_params_format_str = \
'%s -m %d -n %d -k %d --alpha %g --beta %g ' + \ '%s -m %d -n %d -k %d --alpha %g --beta %g ' + \
'--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph' '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph --tune_gemm'
assert ( assert (
cmd == ( cmd == (
bench_params_format_str % bench_params_format_str %
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment