#include <cassert>
#include <cmath>
#include <cstdlib>
#include <memory>
#include <mutex>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include "src/turbomind/layers/DenseWeight.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/gemm.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"

using namespace turbomind;

// Can be replaced by the function provided by a test framework.
class TestFailureError: public std::exception {
private:
    std::string msg_;

public:
    explicit TestFailureError() = default;
    explicit TestFailureError(std::string name, std::string msg = "")
    {
        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
    }
    const char* what() const throw()
    {
        return msg_.c_str();
    }
};

#define EXPECT_TRUE(cond)                                                              \
    do {                                                                               \
        if (!(cond)) {                                                                 \
            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__);     \
            throw TestFailureError(__func__);                                          \
        }                                                                              \
    } while (false)

#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref)                              \
    do {                                                                               \
        bool is_ok = checkResult<dtype, ctype>(name, out, ref);                        \
        if (!is_ok) {                                                                  \
            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__);     \
            throw TestFailureError(__func__);                                          \
        }                                                                              \
    } while (false)

////////////////////////////////////////////////////////////////////////////////////

// TensorWrapper manages a tensor object together with its memory buffer,
// because tensor.data is const and cannot be written through the Tensor itself.
class TensorWrapper {
private:
    IAllocator* allocator;

public:
    std::vector<size_t> shape;
    DataType            type;
    Tensor*             tensor;
    void*               data;

    TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
    {
        this->allocator = allocator;
        this->type      = dtype;
        this->shape     = shape;

        size_t tensor_memsize = this->memsize();
        this->data            = this->allocator->malloc(tensor_memsize, false);
        if (zero_init) {
            check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
        }
        else {
            setRandomValues();
        }
        this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
    }

    TensorWrapper(TensorWrapper const& other):
        allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
    {
        TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
    }

    ~TensorWrapper()
    {
        delete tensor;
        allocator->free((void**)(&data));
    }

    void setInvalidValues()
    {
        size_t type_size   = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
        size_t tensor_size = type_size * tensor->size();
        // Fill with a fixed garbage byte pattern to guarantee invalid values.
        check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
    }

    void setRandomValues()
    {
        // Random initialization.
        size_t num_elements = this->size();
        switch (this->type) {
            case TYPE_FP32:
                cudaRandomUniform((float*)data, num_elements);
                break;
            case TYPE_FP16:
                cudaRandomUniform((half*)data, num_elements);
                break;
            default:
                // More types will be added if needed.
                throw std::runtime_error("Not supported data type");
        }
    }

    size_t size()
    {
        size_t n_elements = 1;
        for (size_t s : this->shape) {
            n_elements *= s;
        }
        return n_elements;
    }

    size_t memsize()
    {
        size_t type_size = 0;
        switch (this->type) {
            case TYPE_FP32:
                type_size = sizeof(float);
                break;
            case TYPE_FP16:
                type_size = sizeof(half);
                break;
            default:
                throw std::runtime_error("Not supported data type.");
        }
        return type_size * this->size();
    }
};
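// Note on the reference implementation below: the test tensors are stored row-major,
// while cuBLAS assumes column-major layout. computeReference() therefore obtains a
// row-major C = op(A) * op(B) by asking cuBLAS for C^T = op(B)^T * op(A)^T, i.e. it
// passes B before A and swaps m and n. A minimal usage sketch (shapes are arbitrary,
// and an `allocator` is assumed to be in scope):
//
//   TensorWrapper A(&allocator, TYPE_FP32, {16, 64});        // m x k
//   TensorWrapper B(&allocator, TYPE_FP32, {64, 32});        // k x n
//   TensorWrapper C(&allocator, TYPE_FP32, {16, 32}, true);  // m x n
//   computeReference<TYPE_FP32>(GEMM_OP_N, GEMM_OP_N, C, A, B);  // C = A * B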
template<DataType computeType>
void computeReference(GemmOp         transa,
                      GemmOp         transb,
                      TensorWrapper& C,
                      TensorWrapper& A,
                      TensorWrapper& B,
                      float          alpha = 1.0f,
                      float          beta  = 0.0f)
{
    size_t m = C.shape[0];
    size_t n = C.shape[1];
    size_t k = A.shape[1];

    size_t lda = (transa == GEMM_OP_N) ? k : m;
    size_t ldb = (transb == GEMM_OP_N) ? n : k;
    size_t ldc = n;

    cudaDataType_t atype        = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
    cudaDataType_t btype        = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
    cudaDataType_t ctype        = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
    cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;

    cublasHandle_t cublas_handle;
    check_cuda_error(cublasCreate(&cublas_handle));

    half        h_alpha = (half)alpha;
    half        h_beta  = (half)beta;
    const void* _alpha  = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
    const void* _beta   = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;

    check_cuda_error(cublasGemmEx(cublas_handle,
                                  getCublasOperation(transb),
                                  getCublasOperation(transa),
                                  n, m, k,
                                  _alpha,
                                  (const void*)B.data, btype, ldb,
                                  (const void*)A.data, atype, lda,
                                  _beta,
                                  (void*)C.data, ctype, ldc,
                                  compute_type,
                                  CUBLAS_GEMM_DEFAULT));
    check_cuda_error(cublasDestroy(cublas_handle));
    cudaDeviceSynchronize();
}

bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
    // Params: a = value to compare, b = reference value.
    // This follows the implementation of numpy.isclose(), which checks
    //   abs(a - b) <= (atol + rtol * abs(b)).
    // Note that the inequality is asymmetric: b is treated as the reference value,
    // and the absolute and relative tolerances are combined into a single bound.
    // The default values of atol and rtol are borrowed from numpy.isclose().
    // If both values are NaN, the result is true.
    if (isnan(a) && isnan(b)) {
        return true;
    }
    return fabs(a - b) <= (atol + rtol * fabs(b));
}
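// Worked examples of the check above (values chosen purely for illustration):
//   almostEqual(1.000001f, 1.0f)            -> true : |a - b| = 1e-6 <= 1e-5 + 1e-8 * 1.0
//   almostEqual(1.1f, 1.0f)                 -> false: |a - b| = 0.1  >  1e-5 + 1e-8 * 1.0
//   almostEqual(1.05f, 1.0f, 1e-3f, 1e-1f)  -> true : |a - b| = 0.05 <= 1e-3 + 1e-1 * 1.0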
template<typename T>
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
{
    assert(out.type == ref.type);

    size_t out_size = out.size();
    size_t ref_size = ref.size();

    T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
    T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
    cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    size_t failures = 0;
    for (size_t i = 0; i < out_size; ++i) {
        // The values for the output and the reference.
        float a = (float)h_out[i];
        float b = (float)h_ref[i];

        bool ok = almostEqual(a, b, atol, rtol);
        // Print the first few mismatches.
        if (!ok && failures < 4) {
            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
            TM_LOG_ERROR(">>    found......: %10.6f", a);
            TM_LOG_ERROR(">>    expected...: %10.6f", b);
            TM_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
            TM_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
        }
        // Update the number of failures.
        failures += ok ? 0 : 1;
    }
    free(h_out);
    free(h_ref);

    // Allow up to 1% of the elements to mismatch.
    size_t tol_failures = (size_t)(0.01 * out_size);
    TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
                name.c_str(),
                failures <= tol_failures ? "OK" : "FAILED",
                100. * failures / out_size,
                atol,
                rtol);
    return failures <= tol_failures;
}

template<typename T, DataType computeType>
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
{
    float atol  = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
    float rtol  = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
    bool  is_ok = false;
    if (sizeof(T) == 4) {
        is_ok = _checkResult<float>(name, out, ref, atol, rtol);
    }
    else {
        is_ok = _checkResult<half>(name, out, ref, atol, rtol);
    }
    return is_ok;
}

template<typename T, DataType computeType>
bool checkResult(TensorWrapper& out, TensorWrapper& ref)
{
    return checkResult<T, computeType>("", out, ref);
}

template<typename T>
std::string toString()
{
    std::string str = "dtype=";
    str += std::is_same<T, float>::value ? "FP32" : "FP16";
    return str;
}

template<typename T, DataType ctype>
std::string toString()
{
    std::string str = "dtype=";
    str += std::is_same<T, float>::value ? "FP32" : "FP16";
    str += ", compute_type=";
    str += (ctype == TYPE_FP32) ? "FP32" : "FP16";
    return str;
}

std::string toString(GemmOp op)
{
    return op == GEMM_OP_N ? "N" : "T";
}

struct GemmOpPair {
    GemmOp transa;
    GemmOp transb;
};

static const std::vector<GemmOpPair> op_pairs{
    {GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}};

static inline std::string
getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k)
{
    return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]",
                  func_name,
                  getGemmOpString(transa).c_str(),
                  getGemmOpString(transb).c_str(),
                  m,
                  n,
                  k);
}

static inline std::string getTestName(const char* func_name, GemmOpPair op_pair, size_t m, size_t n, size_t k)
{
    return getTestName(func_name, op_pair.transa, op_pair.transb, m, n, k);
}
/////////////////////////////////// Unittests //////////////////////////////////////////

template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
    TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    Allocator<AllocatorType::CUDA> allocator(getDevice());

    DataType      dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
    TensorWrapper expected(&allocator, dtype, {m, n}, true);

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);

    for (auto& op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);

        computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);

        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
        size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
        size_t ldc = n;

        // api1: fully typed interface.
        c_tensor.setInvalidValues();  // to guarantee C starts from invalid data
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
                   a_tensor.data, a_tensor.type, lda,
                   b_tensor.data, b_tensor.type, ldb,
                   c_tensor.data, c_tensor.type, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);

        // api2: leading dimensions only.
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);

        // api3: raw pointers only.
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);

        // api4: DenseWeight interface.
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
                   a_tensor.data,
                   DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
                   c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
    }
    check_cuda_error(cudaStreamDestroy(stream));
}
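// A single correctness case can also be run in isolation, e.g. (sketch; shape values
// are arbitrary and only mirror what main() below does over a list of test cases):
//   testGemmCorrectnessMatmul<float, TYPE_FP32>(16, 32, 64);
//   testGemmCorrectnessMatmul<half, TYPE_FP16>(16, 32, 64);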
// template<typename T, DataType computeType>
// void testGemmConsistencyMatmul(size_t m, size_t n, size_t k)
// {
//     // Test if Gemm is consistent with cublasMMWrapper.
//     TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
//                 m, n, k, toString<T, computeType>().c_str());
//     Allocator<AllocatorType::CUDA> allocator(getDevice());
//     cudaStream_t stream;
//     check_cuda_error(cudaStreamCreate(&stream));
//
//     DataType      dtype = getTensorType<T>();
//     TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
//     TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
//     TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
//     TensorWrapper expected(&allocator, dtype, {m, n}, true);
//
//     cublasHandle_t   cublas_handle;
//     cublasLtHandle_t cublaslt_handle;
//     check_cuda_error(cublasCreate(&cublas_handle));
//     check_cuda_error(cublasLtCreate(&cublaslt_handle));
//     check_cuda_error(cublasSetStream(cublas_handle, stream));
//     cublasAlgoMap   cublas_algo_map(GEMM_CONFIG);
//     std::mutex*     cublas_wrapper_mutex = new std::mutex();
//     cublasMMWrapper cublas_wrapper(
//         cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
//     cudaDataType_t cuda_dtype = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
//     cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
//     cublas_wrapper.setGemmConfig(cuda_dtype, cuda_dtype, cuda_dtype, cuda_ctype);
//
//     std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
//     gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
//
//     for (auto& op_pair : op_pairs) {
//         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
//
//         // Switch A/B because Gemm expects column-major layout, as cublas does.
//         size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
//         size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
//         size_t ldc = n;
//         cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), getCublasOperation(op_pair.transa),
//                             n, m, k, b_tensor.data, ldb, a_tensor.data, lda, expected.data, ldc);
//
//         c_tensor.setInvalidValues();  // to guarantee C starts from invalid data
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
//                    a_tensor.data, a_tensor.type, lda,
//                    b_tensor.data, b_tensor.type, ldb,
//                    c_tensor.data, c_tensor.type, ldc);
//         EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
//         EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
//         EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
//                    a_tensor.data,
//                    DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
//                    c_tensor.data);
//         EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
//     }
//     delete cublas_wrapper_mutex;
//     check_cuda_error(cublasLtDestroy(cublaslt_handle));
//     check_cuda_error(cublasDestroy(cublas_handle));
//     check_cuda_error(cudaStreamDestroy(stream));
// }
// template<typename T, DataType computeType>
// void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k)
// {
//     // Test if Gemm is consistent with cublasMMWrapper.
//     TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
//                 m, n, k, toString<T, computeType>().c_str());
//     Allocator<AllocatorType::CUDA> allocator(getDevice());
//     cudaStream_t stream;
//     check_cuda_error(cudaStreamCreate(&stream));
//
//     // Batch of in/out tensors.
//     DataType a_type = getTensorType<T>();
//     DataType b_type = getTensorType<T>();
//     DataType c_type = getTensorType<T>();
//     std::vector<TensorWrapper*> a_tensors;
//     std::vector<TensorWrapper*> b_tensors;
//     std::vector<TensorWrapper*> c_tensors;
//     std::vector<TensorWrapper*> expecteds;
//     const size_t batch_size = 3;
//     for (size_t i = 0; i < batch_size; ++i) {
//         a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false));
//         b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false));
//         c_tensors.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true));
//         expecteds.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true));
//     }
//
//     const T* hA[]{(const T*)a_tensors[0]->data,
//                   (const T*)a_tensors[1]->data,
//                   (const T*)a_tensors[2]->data,
//                   nullptr,  // for memory alignment.
//                   (const T*)b_tensors[0]->data,
//                   (const T*)b_tensors[1]->data,
//                   (const T*)b_tensors[2]->data,
//                   nullptr,  // for memory alignment.
//                   (const T*)c_tensors[0]->data,
//                   (const T*)c_tensors[1]->data,
//                   (const T*)c_tensors[2]->data,
//                   nullptr,  // for memory alignment.
//                   (const T*)expecteds[0]->data,
//                   (const T*)expecteds[1]->data,
//                   (const T*)expecteds[2]->data};
//     T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false));
//     check_cuda_error(
//         cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
//     const void* const* batch_a        = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
//     const void* const* batch_b        = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
//     void* const*       batch_c        = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
//     void* const*       batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
//
//     cublasHandle_t   cublas_handle;
//     cublasLtHandle_t cublaslt_handle;
//     check_cuda_error(cublasCreate(&cublas_handle));
//     check_cuda_error(cublasLtCreate(&cublaslt_handle));
//     check_cuda_error(cublasSetStream(cublas_handle, stream));
//     cublasAlgoMap   cublas_algo_map(GEMM_CONFIG);
//     std::mutex*     cublas_wrapper_mutex = new std::mutex();
//     cublasMMWrapper cublas_wrapper(
//         cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
//     cudaDataType_t dtype = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
//     cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
//     cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype);
//
//     std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
//     gemm->setTypes(a_type, b_type, c_type, computeType);
//
//     for (auto& op_pair : op_pairs) {
//         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
//         TM_LOG_DEBUG(tc_name);
//
//         size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
//         size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
//         size_t ldc = n;
//
//         // Switch A/B because Gemm expects column-major layout, as cublas does.
//         cublas_wrapper.batchedGemm(getCublasOperation(op_pair.transb),
//                                    getCublasOperation(op_pair.transa),
//                                    n, m, k,
//                                    (const void* const*)batch_b, ldb,
//                                    (const void* const*)batch_a, lda,
//                                    (void* const*)batch_expected, ldc,
//                                    batch_size);
//
//         gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
//                           batch_a, a_type, lda,
//                           batch_b, b_type, ldb,
//                           batch_c, c_type, ldc,
//                           batch_size);
//         for (size_t i = 0; i < batch_size; ++i) {
//             EXPECT_ALMOST_EQUAL(
//                 tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
//         }
//
//         for (size_t i = 0; i < batch_size; ++i) {
//             c_tensors[i]->setInvalidValues();
//         }
//         gemm->batchedGemm(
//             op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size);
//         for (size_t i = 0; i < batch_size; ++i) {
//             EXPECT_ALMOST_EQUAL(
//                 tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
//         }
//
//         for (size_t i = 0; i < batch_size; ++i) {
//             c_tensors[i]->setInvalidValues();
//         }
//         gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size);
//         for (size_t i = 0; i < batch_size; ++i) {
//             EXPECT_ALMOST_EQUAL(
//                 tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
//         }
//     }
//
//     a_tensors.clear();
//     b_tensors.clear();
//     c_tensors.clear();
//     expecteds.clear();
//     delete cublas_wrapper_mutex;
//     check_cuda_error(cublasLtDestroy(cublaslt_handle));
//     check_cuda_error(cublasDestroy(cublas_handle));
//     check_cuda_error(cudaStreamDestroy(stream));
// }
// template<typename T, DataType computeType>
// void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k)
// {
//     // Test if Gemm is consistent with cublasMMWrapper.
//     TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
//                 batch_size, m, n, k, toString<T, computeType>().c_str());
//     Allocator<AllocatorType::CUDA> allocator(getDevice());
//     cudaStream_t stream;
//     check_cuda_error(cudaStreamCreate(&stream));
//
//     DataType      data_type = getTensorType<T>();
//     TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false);
//     TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false);
//     TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true);
//     TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true);
//
//     cublasHandle_t   cublas_handle;
//     cublasLtHandle_t cublaslt_handle;
//     check_cuda_error(cublasCreate(&cublas_handle));
//     check_cuda_error(cublasLtCreate(&cublaslt_handle));
//     check_cuda_error(cublasSetStream(cublas_handle, stream));
//     cublasAlgoMap   cublas_algo_map(GEMM_CONFIG);
//     std::mutex*     cublas_wrapper_mutex = new std::mutex();
//     cublasMMWrapper cublas_wrapper(
//         cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
//     cudaDataType_t dtype = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
//     cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
//     cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype);
//
//     std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
//     gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
//
//     for (auto& op_pair : op_pairs) {
//         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
//
//         // Switch A/B because Gemm expects column-major layout, as cublas does.
//         size_t  lda     = (op_pair.transa == GEMM_OP_N) ? k : m;
//         size_t  ldb     = (op_pair.transb == GEMM_OP_N) ? n : k;
//         size_t  ldc     = n;
//         int64_t stridea = m * k;
//         int64_t strideb = k * n;
//         int64_t stridec = m * n;
//         float   alpha   = 1.0f;
//         float   beta    = 0.0f;
//         cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb),
//                                           getCublasOperation(op_pair.transa),
//                                           n, m, k,
//                                           alpha,
//                                           b_tensor.data, getCublasDataType(b_tensor.type), ldb, strideb,
//                                           a_tensor.data, getCublasDataType(a_tensor.type), lda, stridea,
//                                           beta,
//                                           expected.data, getCublasDataType(expected.type), ldc, stridec,
//                                           batch_size,
//                                           getCublasDataType(computeType));
//
//         c_tensor.setInvalidValues();  // to guarantee C starts from invalid data
//         gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
//                                  a_tensor.data, a_tensor.type, lda, stridea,
//                                  b_tensor.data, b_tensor.type, ldb, strideb,
//                                  c_tensor.data, c_tensor.type, ldc, stridec,
//                                  batch_size, computeType, alpha, beta);
//         EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
//                                  a_tensor.data, lda, stridea,
//                                  b_tensor.data, ldb, strideb,
//                                  c_tensor.data, ldc, stridec,
//                                  batch_size, alpha, beta);
//         EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
//                                  a_tensor.data, stridea,
//                                  b_tensor.data, strideb,
//                                  c_tensor.data, stridec,
//                                  batch_size, alpha, beta);
//         EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
//                                  a_tensor.data, b_tensor.data, c_tensor.data,
//                                  batch_size, alpha, beta);
//         EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
//     }
//     delete cublas_wrapper_mutex;
//     check_cuda_error(cublasLtDestroy(cublaslt_handle));
//     check_cuda_error(cublasDestroy(cublas_handle));
//     check_cuda_error(cudaStreamDestroy(stream));
// }

#ifdef SPARSITY_ENABLED
// The current SpGemm only supports TYPE_FP16 for T and computeType,
// but let us keep these template parameters for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
    TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    Allocator<AllocatorType::CUDA> allocator(getDevice());

    DataType      dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
    TensorWrapper expected(&allocator, dtype, {m, n}, true);

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);

    for (auto& op_pair : op_pairs) {
        // A/B will be switched in SpGemm.
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);

        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);

        void* b_compressed;
        compressMatrixB(
            &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);

        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
        size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
        size_t ldc = n;

        c_tensor.setInvalidValues();  // to guarantee C starts from invalid data
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
                   a_tensor.data, a_tensor.type, lda,
                   b_compressed, b_tensor.type, ldb,
                   c_tensor.data, c_tensor.type, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);

        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);

        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);

        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
                   a_tensor.data,
                   DenseWeight<T>{(const T*)b_tensor.data, nullptr, (const T*)b_compressed},
                   c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);

        allocator.free((void**)(&b_compressed));
    }
    check_cuda_error(cudaStreamDestroy(stream));
}
// template<typename T, DataType computeType>
// void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k)
// {
//     // Test if Gemm is consistent with cublasMMWrapper.
//     TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
//                 m, n, k, toString<T, computeType>().c_str());
//     Allocator<AllocatorType::CUDA> allocator(getDevice());
//     cudaStream_t stream;
//     check_cuda_error(cudaStreamCreate(&stream));
//
//     DataType      dtype = getTensorType<T>();
//     TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
//     TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
//     TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
//     TensorWrapper expected(&allocator, dtype, {m, n}, true);
//
//     cublasHandle_t   cublas_handle;
//     cublasLtHandle_t cublaslt_handle;
//     check_cuda_error(cublasCreate(&cublas_handle));
//     check_cuda_error(cublasLtCreate(&cublaslt_handle));
//     check_cuda_error(cublasSetStream(cublas_handle, stream));
//     cublasAlgoMap   cublas_algo_map(GEMM_CONFIG);
//     std::mutex*     cublas_wrapper_mutex = new std::mutex();
//     cublasMMWrapper cublas_wrapper(
//         cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
//     cudaDataType_t cu_dtype = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
//     cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
//     cublas_wrapper.setGemmConfig(cu_dtype, cu_dtype, cu_dtype, cu_ctype);
//
//     std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
//     gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
//
//     for (auto& op_pair : op_pairs) {
//         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
//         TM_LOG_DEBUG(tc_name);
//
//         b_tensor.setRandomValues();
//         pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
//
//         // Switch A/B because Gemm expects column-major layout, as cublas does.
//         size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
//         size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
//         size_t ldc = n;
//         cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), getCublasOperation(op_pair.transa),
//                             n, m, k, b_tensor.data, ldb, a_tensor.data, lda, expected.data, ldc);
//
//         void* b_compressed;
//         compressMatrixB(
//             &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
//
//         c_tensor.setInvalidValues();  // to guarantee C starts from invalid data
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
//                    a_tensor.data, a_tensor.type, lda,
//                    b_compressed, b_tensor.type, ldb,
//                    c_tensor.data, c_tensor.type, ldc);
//         EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
//         EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
//
//         c_tensor.setInvalidValues();
//         gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
//         EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
//     }
//     delete cublas_wrapper_mutex;
//     check_cuda_error(cublasLtDestroy(cublaslt_handle));
//     check_cuda_error(cublasDestroy(cublas_handle));
//     check_cuda_error(cudaStreamDestroy(stream));
// }
#endif

int main(int argc, char* argv[])
{
    // testGemmCreate();
    using testcase_t = std::tuple<size_t, size_t, size_t>;
    std::vector<testcase_t> testcases = {
        {16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}};

    // Computation correctness tests.
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);
        testGemmCorrectnessMatmul<float, TYPE_FP32>(m, n, k);
        testGemmCorrectnessMatmul<half, TYPE_FP32>(m, n, k);
        testGemmCorrectnessMatmul<half, TYPE_FP16>(m, n, k);
        // testGemmConsistencyMatmul<float, TYPE_FP32>(m, n, k);
        // testGemmConsistencyMatmul<half, TYPE_FP32>(m, n, k);
        // testGemmConsistencyMatmul<half, TYPE_FP16>(m, n, k);
        // testGemmConsistencyBatchedMatmul<float, TYPE_FP32>(m, n, k);
        // testGemmConsistencyBatchedMatmul<half, TYPE_FP32>(m, n, k);
        // testGemmConsistencyBatchedMatmul<half, TYPE_FP16>(m, n, k);
        // testGemmConsistencyStridedBatchedMatmul<float, TYPE_FP32>(7, m, n, k);
        // testGemmConsistencyStridedBatchedMatmul<half, TYPE_FP32>(7, m, n, k);
        // testGemmConsistencyStridedBatchedMatmul<half, TYPE_FP16>(7, m, n, k);
    }
#ifdef SPARSITY_ENABLED
    // Reset for the SpGemm tests.
    testcases.clear();
    testcases.insert(testcases.end(),
                     {{8, 32, 32},  // minimum possible example.
                      {8, 32, 64},
                      {64, 64, 64},
                      {16, 32, 64},
                      {1024, 32, 1024},
                      {1024, 1024, 32},
                      {16, 1024, 1024},
                      {1024, 1024, 1024}});
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);
        testSpGemmCorrectnessMatmul<half, TYPE_FP16>(m, n, k);
        // testSpGemmConsistencyMatmul<half, TYPE_FP16>(m, n, k);
    }
#endif
    TM_LOG_INFO("Test done");
    return 0;
}