Benchmarks: micro benchmarks - add int8 support for cublaslt function (#574)

**Description** add int8 support for cublaslt function.

Benchmarks: micro benchmarks - add int8 support for cublaslt function (#574)
**Description** add int8 support for cublaslt function.
f53d941a · Yuting Jiang · GitHub · c7800bb8 · f53d941a · f53d941a
Unverified Commit f53d941a authored Nov 20, 2023 by Yuting Jiang Committed by GitHub Nov 20, 2023
4 changed files
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
@@ -23,7 +23,7 @@ class CublasLtBenchmark(MicroBenchmarkWithInvoke):
        super().__init__(name, parameters)
        self._bin_name = 'cublaslt_gemm'
-        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
+        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8']
    def mrange(self, start, stop=-1, multiplication_factor=2):
        """Range constructor with multiplication factor.

--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
@@ -16,6 +16,7 @@ using fp16 = half;
 using bf16 = nv_bfloat16;
 using fp8e4m3 = __nv_fp8_e4m3;
 using fp8e5m2 = __nv_fp8_e5m2;
+using int8 = int8_t;
 struct Args {
    int m = 16;
@@ -84,6 +85,8 @@ template <typename T> cudaDataType_t get_datatype() {
        return CUDA_R_8F_E4M3;
    if (std::is_same<T, fp8e5m2>::value)
        return CUDA_R_8F_E5M2;
+    if (std::is_same<T, int8>::value)
+        return CUDA_R_8I;
    throw std::invalid_argument("Unknown type");
 }
@@ -162,6 +165,8 @@ int main(int argc, char **argv) {
        run<fp8e4m3, fp8e4m3, fp16>(&args);
    else if (args.in_type == "fp8e5m2")
        run<fp8e5m2, fp8e4m3, fp16>(&args);
+    else if (args.in_type == "int8")
+        run<int8>(&args);
    else
        throw std::invalid_argument("Unknown type " + args.in_type);

--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
@@ -62,6 +62,8 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
        gemm_compute_type = CUBLAS_COMPUTE_32F;
    if (a_type == CUDA_R_64F || b_type == CUDA_R_64F)
        gemm_compute_type = CUBLAS_COMPUTE_64F;
+    if (a_type == CUDA_R_8I)
+        gemm_compute_type = CUBLAS_COMPUTE_32I;
    cublasLtMatmulDesc_t op_desc = nullptr;
    CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));

--- a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
@@ -63,15 +63,15 @@ class CublasLtBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(
            self.benchmark_name,
-            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
+            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64 int8',
        )
        self.assertTrue(benchmark._preprocess())
-        self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))
+        self.assertEqual(4 * (2 * 2 * 3 + 2) * len(benchmark._args.in_types), len(benchmark._commands))
        def cmd(t, b, m, n, k):
            return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'
-        for _t in ['fp16', 'fp32', 'fp64']:
+        for _t in ['fp16', 'fp32', 'fp64', 'int8']:
            for _b in [2, 4, 8, 16]:
                for _m in [2, 4]:
                    for _n in [4, 8]: