/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"

namespace fastertransformer {

template<typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true);

template<typename T>
void deviceMemSetZero(T* ptr, size_t size);

template<typename T>
void deviceFree(T*& ptr);

template<typename T>
void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0);

template<typename T>
void cudaD2Hcpy(T* tgt, const T* src, const size_t size);

template<typename T>
void cudaH2Dcpy(T* tgt, const T* src, const size_t size);

template<typename T>
void cudaD2Dcpy(T* tgt, const T* src, const size_t size);

template<typename T>
void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL);

template<typename T>
void cudaRandomUniform(T* buffer, const size_t size);

template<typename T>
int loadWeightFromBin(T*                  ptr,
                      std::vector<size_t> shape,
                      std::string         filename,
                      FtCudaDataType      model_file_type = FtCudaDataType::FP32);

std::vector<float> loadArrayFromBin(std::vector<size_t> shape, std::string filename);

// template<typename T>
// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t*             quantized_weight_ptr,
//                                               T*                  scale_ptr,
//                                               std::vector<size_t> shape,
//                                               std::string         filename,
//                                               FtCudaDataType      model_file_type = FtCudaDataType::FP32);

void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream);
void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream);

#ifdef ENABLE_FP8
void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream);
void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream);
void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream);
void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream);
void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream);
#endif  // ENABLE_FP8

#ifdef ENABLE_BF16
void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream);
#endif  // ENABLE_BF16

template<typename T_OUT, typename T_IN>
void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream);

// Convert a multi-dimensional index into a flattened (row-major) offset;
// a usage sketch follows the transpose declarations below.
template<typename T, size_t n_dims>
__inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims])
{
    size_t flat_idx = 0;
    for (size_t i = 0; i < n_dims; i++) {
        flat_idx += idx[i];
        if (i + 1 < n_dims)
            flat_idx *= dims[i + 1];
    }
    return flat_idx;
}

// Convert a flattened (row-major) offset back into a multi-dimensional index.
template<typename T1, typename T2, size_t n_dims>
__inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims])
{
    for (int i = n_dims - 1; i >= 0; i--) {
        idx[i] = flat_idx % dims[i];
        flat_idx /= dims[i];
    }
}

template<typename T>
void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1);

template<typename T>
void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
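// Usage sketch for dim2flat/flat2dim above (illustrative only, not part of the
// public API; the dims/idx values are made up for the example). The two helpers
// are inverses for in-range indices:
//
//   const int dims[3] = {2, 3, 4};
//   const int idx[3]  = {1, 2, 3};
//   size_t    flat    = dim2flat(idx, dims);  // ((1 * 3) + 2) * 4 + 3 == 23
//   int       out[3];
//   flat2dim(flat, dims, out);                // recovers out == {1, 2, 3}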
template<typename T>
void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2);

template<typename T>
void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream);

template<typename T>
void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream);

template<typename T_OUT, typename T_IN>
void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream = 0);

template<typename T_OUT, typename T_IN>
void invokeCudaD2DScaleCpyConvert(
    T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0);

inline bool checkIfFileExist(const std::string& file_path)
{
    std::ifstream in(file_path, std::ios::in | std::ios::binary);
    if (in.is_open()) {
        in.close();
        return true;
    }
    return false;
}

template<typename T>
void saveToBinary(const T* ptr, const size_t size, std::string filename);

template<typename T_IN, typename T_fake_type>
void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream);

size_t cuda_datatype_size(FtCudaDataType dt);

template<typename T>
bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream);

}  // namespace fastertransformer
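// Illustrative usage of the allocation/copy helpers above (a sketch, assuming
// the conventional semantics: sizes are element counts, cudaH2Dcpy/cudaD2Hcpy
// copy host-to-device/device-to-host, and deviceFree releases the buffer and
// resets the pointer):
//
//   std::vector<float> host(1024, 1.0f);
//   float*             device_buf = nullptr;
//   fastertransformer::deviceMalloc(&device_buf, host.size(), /* is_random_initialize */ false);
//   fastertransformer::cudaH2Dcpy(device_buf, host.data(), host.size());
//   // ... launch kernels that read/write device_buf ...
//   fastertransformer::cudaD2Hcpy(host.data(), device_buf, host.size());
//   fastertransformer::deviceFree(device_buf);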