/* * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "encoder_igemm_func.h" // TODO(bhsueh) Remove this include #include "src/turbomind/utils/cublasAlgoMap.h" #include "src/turbomind/utils/cuda_utils.h" #include #include #include #ifdef ENABLE_BF16 #include #endif #ifdef ENABLE_FP8 #include #endif #include #include #include #include #include namespace turbomind { // Scale Type Converter // is_fp16_compute_type is only valid when T = half template struct ScaleTypeConverter { using Type = float; }; template<> struct ScaleTypeConverter { using Type = half; }; template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, int batch_size, int seq_len, int head_num, int size_per_head, int m, int n, int k, const scaleT* alpha, /* host pointer */ const T* A, const T* B, const scaleT* beta, /* host pointer */ T* C, void* workSpace, size_t workSpaceSize, FILE* fout, customMatmulPerf_t perfResults[], int AlgoCombinations, cudaDataType_t dtype_fp8 = CUDA_R_32F, int batchCount = 1, int64_t strideA = 0, int64_t strideB = 0, int64_t strideD = 0); size_t calGemmTestBufSizeInByte(int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int vocab_size, int int8_mode, CublasDataType data_type); size_t calGemmTestBufSizeInByteXlnet( int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16); int printPerfStructure(int batch_size, int seq_len, int head_num, int size_per_head, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, CublasDataType data_type, int hasPrint, int batch_count = 1); } // namespace turbomind