Unverified commit 9efcac38, authored by Li Zhang, committed via GitHub (parent 720fc533)

check-in fastertransformer (#7)

* add ft code
* gitignore
* fix lint
* revert fmha
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include <memory>
namespace ft = fastertransformer;
template<typename T>
struct LlamaTritonSharedModelInstance {
std::unique_ptr<ft::LlamaV2<T>> llm;
std::shared_ptr<ft::LlamaWeight<T>> llm_weight;
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator;
std::unique_ptr<ft::cublasAlgoMap> cublas_algo_map;
std::unique_ptr<std::mutex> cublas_wrapper_mutex;
std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper;
std::unique_ptr<cudaDeviceProp> cuda_device_prop_ptr;
const int session_len;
};
template<typename T>
struct LlamaTritonModelInstance: AbstractTransformerModelInstance {
LlamaTritonModelInstance(std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance,
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator);
~LlamaTritonModelInstance();
std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) override;
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors) override;
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
ft::AbstractInstanceComm*) override;
static std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors);
private:
const std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance_;
const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator_;
std::unordered_map<std::string, ft::Tensor>
convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors);
void allocateBuffer(const size_t request_batch_size, const size_t beam_width, const size_t session_len);
void freeBuffer();
int* d_input_ids_ = nullptr;
int* d_input_lengths_ = nullptr;
int* d_input_bad_words_ = nullptr;
int* d_input_stop_words_ = nullptr;
int* d_request_prompt_lengths_ = nullptr;
T* d_request_prompt_embedding_ = nullptr;
float* d_top_p_decay_ = nullptr;
float* d_top_p_min_ = nullptr;
int* d_top_p_reset_ids_ = nullptr;
int* d_output_ids_ = nullptr;
int* d_sequence_lengths_ = nullptr;
float* d_output_log_probs_ = nullptr;
float* d_cum_log_probs_ = nullptr;
uint32_t* h_total_output_lengths_ = nullptr;
std::exception_ptr h_exception_ = nullptr;
};
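// --- Hedged usage sketch (not part of the original header) -------------------
// A minimal single-GPU driver for the interfaces above, assuming one node and
// tensor/pipeline parallel sizes of 1. The model directory path and the
// "input_ids" / "input_lengths" tensor names are illustrative assumptions;
// consult LlamaTritonModel for the exact request schema.
#include <cuda_runtime.h>
#include <string>
#include <unordered_map>
#include <vector>

inline void llama_single_gpu_example(const std::string& model_dir)
{
    auto model       = AbstractTransformerModel::createLlamaModel(model_dir);
    auto nccl_params = model->createNcclParams(/*node_id=*/0);
    model->createSharedWeights(/*deviceId=*/0, /*rank=*/0);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    auto instance = model->createModelInstance(/*deviceId=*/0, /*rank=*/0, stream, nccl_params);

    std::vector<int> ids{1, 2, 3};
    std::vector<int> lengths{3};
    auto inputs = std::make_shared<std::unordered_map<std::string, triton::Tensor>>();
    inputs->emplace("input_ids",
                    triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {1, ids.size()}, ids.data()});
    inputs->emplace("input_lengths",
                    triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {1, 1}, lengths.data()});

    auto outputs = instance->forward(inputs);  // blocking; returns a name -> Tensor map
    cudaStreamDestroy(stream);
}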
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/nccl_utils.h"
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
{
const int gpu_count = ft::getDeviceCount();
const int tensor_para_size = getTensorParaSize();
const int pipeline_para_size = getPipelineParaSize();
const int local_comm_size = multi_node ? gpu_count : tensor_para_size * pipeline_para_size;
ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0);
ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count);
std::vector<ft::NcclUid> nccl_ids;
if (tensor_para_size > 1 || pipeline_para_size > 1) {
nccl_ids.resize(tensor_para_size + pipeline_para_size);
if (node_id == 0) {
for (uint32_t i = 0; i < nccl_ids.size(); i++) {
ft::ftNcclGetUniqueId(nccl_ids[i]);
}
}
for (size_t i = 0; i < nccl_ids.size(); i++) {
ft::mpi::bcast(&nccl_ids[i], sizeof(nccl_ids[i]), ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
}
}
std::vector<ft::NcclParam> tensor_para_params(local_comm_size);
std::vector<ft::NcclParam> pipeline_para_params(local_comm_size);
// Don't init comm when size == 1
if (tensor_para_size > 1) {
const auto group_id = ft::ftNcclNextGroupId();
ft::ftNcclGroupStart();
for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) {
int rank = node_id * gpu_count + gid - device_id_start;
int tensor_para_rank = rank % tensor_para_size;
int pipeline_para_rank = rank / tensor_para_size;
ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank];
ft::check_cuda_error(cudaSetDevice(gid));
ft::ftNcclCommInitRank(
tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid);
tensor_para_params[gid - device_id_start].group_id_ = group_id;
}
ft::ftNcclGroupEnd();
}
if (pipeline_para_size > 1) {
const auto group_id = ft::ftNcclNextGroupId();
ft::ftNcclGroupStart();
for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) {
int rank = node_id * gpu_count + gid - device_id_start;
int tensor_para_rank = rank % tensor_para_size;
int pipeline_para_rank = rank / tensor_para_size;
ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank];
ft::check_cuda_error(cudaSetDevice(gid));
ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start],
pipeline_para_rank,
pipeline_para_size,
pipeline_para_nccl_uid);
pipeline_para_params[gid - device_id_start].group_id_ = group_id;
}
ft::ftNcclGroupEnd();
}
return std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>(tensor_para_params, pipeline_para_params);
}
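// --- Illustrative note (not part of the original file) -----------------------
// The rank decomposition above places tensor-parallel ranks on adjacent GPUs.
// With tensor_para_size = 2 and pipeline_para_size = 2 on one 4-GPU node:
//   rank 0 -> tp 0, pp 0    rank 1 -> tp 1, pp 0
//   rank 2 -> tp 0, pp 1    rank 3 -> tp 1, pp 1
// A tiny self-contained check of that mapping:
#include <cstdio>

static void print_rank_mapping(int tensor_para_size, int pipeline_para_size)
{
    for (int rank = 0; rank < tensor_para_size * pipeline_para_size; ++rank) {
        std::printf("rank %d -> tp %d, pp %d\n",
                    rank, rank % tensor_para_size, rank / tensor_para_size);
    }
}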
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
#pragma once
#include <memory>
#include <sstream>
#include <sys/time.h>
#include <vector>
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace ft = fastertransformer;
namespace triton {
#ifdef USE_TRITONSERVER_DATATYPE
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
#ifndef TRITONSERVER_API_VERSION_MAJOR
#error TRITONSERVER_API_VERSION_MAJOR Undefined!
#endif
#ifndef TRITONSERVER_API_VERSION_MINOR
#error TRITONSERVER_API_VERSION_MINOR Undefined!
#endif
#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \
|| (TRITONSERVER_API_VERSION_MAJOR > 1)
#define ENABLE_TRITON_BF16 1
#endif
typedef TRITONSERVER_DataType DataType;
typedef TRITONSERVER_MemoryType MemoryType;
constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID;
constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL;
constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8;
constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16;
constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32;
constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64;
constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8;
constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16;
constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32;
constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64;
constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16;
constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32;
constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64;
constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES;
#ifdef ENABLE_TRITON_BF16
constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16;
#endif
constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU;
constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED;
constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU;
#else
typedef ft::DataType DataType;
typedef ft::MemoryType MemoryType;
constexpr DataType TYPE_INVALID = ft::TYPE_INVALID;
constexpr DataType TYPE_BOOL = ft::TYPE_BOOL;
constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8;
constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16;
constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32;
constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64;
constexpr DataType TYPE_INT8 = ft::TYPE_INT8;
constexpr DataType TYPE_INT16 = ft::TYPE_INT16;
constexpr DataType TYPE_INT32 = ft::TYPE_INT32;
constexpr DataType TYPE_INT64 = ft::TYPE_INT64;
constexpr DataType TYPE_FP16 = ft::TYPE_FP16;
constexpr DataType TYPE_FP32 = ft::TYPE_FP32;
constexpr DataType TYPE_FP64 = ft::TYPE_FP64;
constexpr DataType TYPE_BYTES = ft::TYPE_BYTES;
constexpr DataType TYPE_BF16 = ft::TYPE_BF16;
constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU;
constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED;
constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU;
#endif
struct Tensor {
const MemoryType where;
const DataType type;
const std::vector<size_t> shape;
const void* data;
Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
where(_where), type(_type), shape(_shape), data(_data)
{
}
static ft::DataType convertTritonTypeToFt(DataType tmp_type)
{
ft::DataType ft_data_type;
switch (tmp_type) {
case TYPE_INVALID:
ft_data_type = ft::DataType::TYPE_INVALID;
break;
case TYPE_BOOL:
ft_data_type = ft::DataType::TYPE_BOOL;
break;
case TYPE_UINT8:
ft_data_type = ft::DataType::TYPE_UINT8;
break;
case TYPE_UINT16:
ft_data_type = ft::DataType::TYPE_UINT16;
break;
case TYPE_UINT32:
ft_data_type = ft::DataType::TYPE_UINT32;
break;
case TYPE_UINT64:
ft_data_type = ft::DataType::TYPE_UINT64;
break;
case TYPE_INT8:
ft_data_type = ft::DataType::TYPE_INT8;
break;
case TYPE_INT16:
ft_data_type = ft::DataType::TYPE_INT16;
break;
case TYPE_INT32:
ft_data_type = ft::DataType::TYPE_INT32;
break;
case TYPE_INT64:
ft_data_type = ft::DataType::TYPE_INT64;
break;
case TYPE_FP16:
ft_data_type = ft::DataType::TYPE_FP16;
break;
case TYPE_FP32:
ft_data_type = ft::DataType::TYPE_FP32;
break;
case TYPE_FP64:
ft_data_type = ft::DataType::TYPE_FP64;
break;
#ifdef ENABLE_TRITON_BF16
case TYPE_BF16:
ft_data_type = ft::DataType::TYPE_BF16;
break;
#endif
case TYPE_BYTES:
ft_data_type = ft::DataType::TYPE_BYTES;
break;
default:
FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type));
break;
}
return ft_data_type;
}
ft::Tensor convertTritonTensorToFt()
{
ft::DataType ft_data_type = convertTritonTypeToFt(type);
ft::MemoryType ft_memory_type;
switch (where) {
case MEMORY_CPU:
ft_memory_type = ft::MemoryType::MEMORY_CPU;
break;
case MEMORY_CPU_PINNED:
ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED;
break;
case MEMORY_GPU:
ft_memory_type = ft::MemoryType::MEMORY_GPU;
break;
}
return ft::Tensor{ft_memory_type, ft_data_type, shape, data};
}
static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor)
{
DataType triton_data_type;
switch (ft_tensor.type) {
case TYPE_INVALID:
triton_data_type = TYPE_INVALID;
break;
case TYPE_BOOL:
triton_data_type = TYPE_BOOL;
break;
case TYPE_UINT8:
triton_data_type = TYPE_UINT8;
break;
case TYPE_UINT16:
triton_data_type = TYPE_UINT16;
break;
case TYPE_UINT32:
triton_data_type = TYPE_UINT32;
break;
case TYPE_UINT64:
triton_data_type = TYPE_UINT64;
break;
case TYPE_INT8:
triton_data_type = TYPE_INT8;
break;
case TYPE_INT16:
triton_data_type = TYPE_INT16;
break;
case TYPE_INT32:
triton_data_type = TYPE_INT32;
break;
case TYPE_INT64:
triton_data_type = TYPE_INT64;
break;
case TYPE_FP16:
triton_data_type = TYPE_FP16;
break;
case TYPE_FP32:
triton_data_type = TYPE_FP32;
break;
case TYPE_FP64:
triton_data_type = TYPE_FP64;
break;
#ifdef ENABLE_TRITON_BF16
case TYPE_BF16:
triton_data_type = TYPE_BF16;
break;
#endif
case TYPE_BYTES:
triton_data_type = TYPE_BYTES;
break;
default:
FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type));
break;
}
MemoryType triton_memory_type;
switch (ft_tensor.where) {
case MEMORY_CPU:
triton_memory_type = MEMORY_CPU;
break;
case MEMORY_CPU_PINNED:
triton_memory_type = MEMORY_CPU_PINNED;
break;
case MEMORY_GPU:
triton_memory_type = MEMORY_GPU;
break;
}
return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data};
}
};
} // namespace triton
using triton_stream_cb_t = void(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>, void*);
struct AbstractTransformerModel;
struct AbstractTransformerModelInstance;
struct AbstractTransformerModelInstance {
virtual std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) = 0;
virtual std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors) = 0;
virtual std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors, ft::AbstractInstanceComm*)
{
return forward(input_tensors);
}
void registerCallback(triton_stream_cb_t* cb, void* ctx)
{
stream_cb_ = cb;
stream_ctx_ = ctx;
}
void unRegisterCallback()
{
stream_cb_ = nullptr;
stream_ctx_ = nullptr;
}
triton_stream_cb_t* stream_cb_ = nullptr;
void* stream_ctx_ = nullptr;
};
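// --- Hedged usage sketch (not part of the original header) -------------------
// Registering a streaming callback on a model instance. The callback must match
// triton_stream_cb_t; whether and when the backend invokes it with partial
// results depends on the concrete model implementation. Names are illustrative.
static void on_partial_output(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> outputs,
                              void* ctx)
{
    (void)outputs;  // e.g. decode the newest tokens and push them to a client
    (void)ctx;
}

inline void attach_streaming(AbstractTransformerModelInstance& instance, void* user_ctx)
{
    instance.registerCallback(&on_partial_output, user_ctx);
    // ... run forward(); partial outputs may be delivered through the callback ...
    instance.unRegisterCallback();
}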
struct AbstractTransformerModel {
static std::shared_ptr<AbstractTransformerModel> createLlamaModel(std::string model_dir);
virtual std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false);
virtual void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) = 0;
virtual std::unique_ptr<ft::AbstractInstanceComm> createInstanceComm(int size)
{
return nullptr;
}
virtual std::unique_ptr<AbstractTransformerModelInstance>
createModelInstance(int deviceId,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;
virtual void createSharedWeights(int deviceId, int rank) = 0;
virtual std::string toString() = 0;
virtual int getTensorParaSize() = 0;
virtual int getPipelineParaSize() = 0;
};
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/Tensor.h"
namespace ft = fastertransformer;
template<typename T>
void move_tensor_H2D(const triton::Tensor& tensor,
T*& d_ptr,
const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>>* allocator)
{
if (tensor.where == triton::MEMORY_GPU) {
return;
}
size_t tensor_size = 1;
for (auto t : tensor.shape) {
tensor_size *= t;
}
cudaStream_t stream = (*allocator)->returnStream();
d_ptr = (T*)((*allocator)->reMalloc(d_ptr, sizeof(T) * tensor_size, false));
ft::check_cuda_error(cudaMemcpyAsync(d_ptr, (T*)tensor.data, sizeof(T) * tensor_size, cudaMemcpyDefault, stream));
}
template<typename T>
ft::Tensor as_GPU_tensor(const triton::Tensor& tensor, T* d_ptr)
{
return ft::Tensor{ft::MEMORY_GPU,
triton::Tensor::convertTritonTypeToFt(tensor.type),
tensor.shape,
tensor.where == triton::MEMORY_CPU ? d_ptr : tensor.data};
}
inline ft::Tensor as_CPU_tensor(const triton::Tensor& tensor)
{
return ft::Tensor{ft::MEMORY_CPU, triton::Tensor::convertTritonTypeToFt(tensor.type), tensor.shape, tensor.data};
}
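// --- Hedged usage sketch (not part of the original header) -------------------
// Staging a host-resident triton::Tensor onto the GPU with the helpers above.
// The allocator construction mirrors ft::Allocator<CUDA>(device_id); the buffer
// name is illustrative.
inline void stage_input_example(const triton::Tensor& host_ids)
{
    auto allocator =
        std::make_unique<ft::Allocator<ft::AllocatorType::CUDA>>(/*device_id=*/0);

    int* d_ids = nullptr;
    move_tensor_H2D(host_ids, d_ids, &allocator);          // no-op when already on GPU
    ft::Tensor gpu_view = as_GPU_tensor(host_ids, d_ids);  // same shape/type, GPU pointer
    // ... launch work that consumes gpu_view, then release the staging buffer:
    allocator->free((void**)&d_ids);
}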
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(gemm_test)
add_library(cuda_utils STATIC cuda_utils.cc)
set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cuda_utils PUBLIC -lcudart)
add_library(logger STATIC logger.cc)
set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(logger PUBLIC -lcudart)
add_library(cublasAlgoMap STATIC cublasAlgoMap.cc)
set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasAlgoMap PUBLIC -lcublas -lcudart -lcurand cuda_utils logger)
add_library(cublasMMWrapper STATIC cublasMMWrapper.cc)
set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasMMWrapper PUBLIC -lcublas -lcudart -lcurand cublasAlgoMap cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(cublasMMWrapper PUBLIC -lcusparse -lcusparseLt)
endif()
add_library(word_list STATIC word_list.cc)
set_property(TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(nvtx_utils STATIC nvtx_utils.cc)
set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(nvtx_utils PUBLIC -lnvToolsExt)
add_library(memory_utils STATIC memory_utils.cu)
set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(memory_utils PUBLIC cuda_utils logger tensor)
add_library(mpi_utils STATIC mpi_utils.cc)
set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(mpi_utils PUBLIC mpi logger)
endif()
add_library(nccl_utils STATIC nccl_utils.cc)
set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(nccl_utils PUBLIC ${NCCL_LIBRARIES} mpi_utils logger)
endif()
add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasINT8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand cublasAlgoMap cublasMMWrapper cuda_utils logger)
if(ENABLE_FP8)
add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
endif()
add_library(custom_ar_comm STATIC custom_ar_comm.cc)
set_property(TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger)
add_library(gemm STATIC gemm.cc)
set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm PUBLIC
-lcublas -lcublasLt -lcudart -lcurand
cublasAlgoMap memory_utils cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gemm PUBLIC -lcusparse -lcusparseLt)
endif()
add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(tensor STATIC Tensor.cc)
set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tensor PUBLIC cuda_utils logger)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace fastertransformer {
enum IA3_config {
KEY_ADAPTER = 1 << 0,
VALUE_ADAPTER = 1 << 1,
MLP_ADAPTER = 1 << 2,
};
static constexpr IA3_config IA3_NONE = static_cast<IA3_config>(0);
static constexpr size_t IA3_ADAPTER_MAX_NUM_ENCODER = 3;
static constexpr size_t IA3_ADAPTER_MAX_NUM_DECODER = 5;
static inline IA3_config operator&(IA3_config x, IA3_config y)
{
return static_cast<IA3_config>(static_cast<int>(x) & static_cast<int>(y));
}
static inline IA3_config operator|(IA3_config x, IA3_config y)
{
return static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}
static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
{
return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}
} // namespace fastertransformer
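// --- Illustrative use of the IA3_config bit flags above (not part of the original header) --
inline bool ia3_example()
{
    using namespace fastertransformer;
    IA3_config cfg = IA3_NONE;
    cfg |= KEY_ADAPTER;
    cfg |= MLP_ADAPTER;
    // Bitwise tests: VALUE_ADAPTER was not set, MLP_ADAPTER was.
    bool has_value = (cfg & VALUE_ADAPTER) != IA3_NONE;  // false
    bool has_mlp   = (cfg & MLP_ADAPTER) != IA3_NONE;    // true
    return has_mlp && !has_value;
}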
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "stdlib.h"
namespace fastertransformer {
#define ACTIVATION_AMAX_NUM 72
#define INT8O_GEMM_NUM 8
#define TRT_AMAX_NUM 3
#define SCALE_RESERVE_NUM 21
struct ScaleList {
// Part 1 -- 72:
// First 72 are for activation amaxs. For each activation amax, there are 4 values: amax, amax/127.0f,
// amax/127.0f/127.0f, 127.0f/amax -- input_amax 0-3 , Q_aftergemm_amax 4-7, Qbias_amax 8-11, K_aftergemm_amax
// 12-15, Kbias_amax 16-19, V_aftergemm_amax 20-23, Vbias_amax 24-27, bmm1_amax 28-31, Softmax_amax 32-35,
// bmm2_amax 36-39, Proj_aftergemm_scale 40-43, ProjBiasNorm_amax 44-47, FC1_aftergemm_amax 48-51, F1Bias_amax
// 52-55, FC2_aftergemm_amax 56-59, F2BiasNorm_amax 60-63, reserve 64-71
// Part 2 -- 9*hidden_dim:
// Kernel amaxs, for each kernel amax list, there are output_channel values : query_weight_amax_list,
// key_weight_amax_list, value_weight_amax_list, proj_weight_amax_list, FC1_weight_amax_list, FC2_weight_amax_list
// Part 3 -- 8:
// Int8 gemm deQFactor list (8 values): Q_deQ_scale, K_deQ_scale, V_deQ_scale, bmm1_deQ_scale, bmm2_deQ_scale,
// FC0_deQ_scale, FC1_deQ_scale, FC2_deQ_scale
// Part 4 -- 3:
// Amax used in trt fused mha kernel (3 values) : QKVbias_amax, Softmax_amax, bmm2_amax
// Part 5 -- 21: reserve (an indexing sketch follows this header)
const float* d_scale_list_ = nullptr;
const float* h_scale_list_ = nullptr;
size_t size_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM + TRT_AMAX_NUM;
size_t p2_offset_ = ACTIVATION_AMAX_NUM;
size_t p3_offset_ = ACTIVATION_AMAX_NUM + 9 * 768;
size_t p4_offset_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM;
};
} // namespace fastertransformer
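// --- Hedged helper sketch (not part of the original header) ------------------
// How the offsets above index into the flat scale list; the 9 * 768 term assumes
// hidden_dim = 768 exactly as hard-coded in size_, p3_offset_ and p4_offset_.
inline const float* scale_part_ptr(const fastertransformer::ScaleList& s, int part)
{
    switch (part) {
        case 2: return s.d_scale_list_ + s.p2_offset_;  // kernel amax lists
        case 3: return s.d_scale_list_ + s.p3_offset_;  // INT8 GEMM deQ factors
        case 4: return s.d_scale_list_ + s.p4_offset_;  // TRT fused-MHA amaxs
        default: return s.d_scale_list_;                // part 1: activation amaxs
    }
}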
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <dirent.h>
#include <numeric>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
Tensor::Tensor():
// a none tensor.
where(MEMORY_CPU),
type(TYPE_INVALID),
shape({}),
data(nullptr),
offsets({}) // only records the offset
{
}
Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
where(_where), type(_type), shape(_shape), data(_data)
{
}
Tensor::Tensor(const MemoryType _where,
const DataType _type,
const std::vector<size_t> _shape,
const void* _data,
const std::vector<size_t> _offset):
where(_where), type(_type), shape(_shape), data(_data), offsets(_offset)
{
}
void Tensor::parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data)
{
const char magic[] = "\x93"
"NUMPY";
char magic_test[sizeof(magic)] = "\0";
size_t n_elems = fread((void*)magic_test, sizeof(char), sizeof(magic) - 1, f_ptr);
if (n_elems != sizeof(magic) - 1 || std::string(magic) != std::string(magic_test)) {
throw std::runtime_error("Could read magic token in NPY file");
}
uint8_t npy_major = 0;
uint8_t npy_minor = 0;
n_elems = fread((void*)&npy_major, sizeof(uint8_t), 1, f_ptr);
n_elems += fread((void*)&npy_minor, sizeof(uint8_t), 1, f_ptr);
if (npy_major == 1) {
uint16_t header_len_u16 = 0;
n_elems = fread((void*)&header_len_u16, sizeof(uint16_t), 1, f_ptr);
header_len = header_len_u16;
}
else if (npy_major == 2) {
uint32_t header_len_u32 = 0;
n_elems = fread((void*)&header_len_u32, sizeof(uint32_t), 1, f_ptr);
header_len = header_len_u32;
}
else {
throw std::runtime_error("Unsupported npy version: " + std::to_string(npy_major));
}
start_data = 8 + 2 * npy_major + header_len;
}
int Tensor::parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector<size_t>& shape)
{
char* header_c = (char*)malloc(header_len * sizeof(char));
size_t n_elems = fread((void*)header_c, sizeof(char), header_len, f_ptr);
if (n_elems != header_len) {
free(header_c);
return -1;
}
std::string header(header_c, header_len);
free(header_c);
size_t start, end;
start = header.find("'descr'") + 7;
start = header.find("'", start);
end = header.find("'", start + 1);
type = typeFromNumpyDesc(header.substr(start + 1, end - start - 1));
start = header.find("'fortran_order'") + 15;
start = header.find(":", start);
end = header.find(",", start + 1);
if (header.substr(start + 1, end - start - 1).find("False") == std::string::npos) {
throw std::runtime_error("Unsupported value for fortran_order while reading npy file");
}
start = header.find("'shape'") + 7;
start = header.find("(", start);
end = header.find(")", start + 1);
std::istringstream shape_stream(header.substr(start + 1, end - start - 1));
std::string token;
shape.clear();
while (std::getline(shape_stream, token, ',')) {
if (token.find_first_not_of(' ') == std::string::npos) {
break;
}
shape.push_back(std::stoul(token));
}
return 0;
}
Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where)
{
DataType type;
std::vector<size_t> shape;
FILE* f_ptr = fopen(npy_file.c_str(), "rb");
if (f_ptr == nullptr) {
throw std::runtime_error("Could not open file " + npy_file);
}
uint32_t header_len, start_data;
parseNpyIntro(f_ptr, header_len, start_data);
parseNpyHeader(f_ptr, header_len, type, shape);
const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
void* data_cpu = malloc(size * Tensor::getTypeSize(type));
void* data = data_cpu;
size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), size, f_ptr);
FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed");
if (where == MEMORY_GPU) {
cudaMalloc(&data, size * Tensor::getTypeSize(type));
cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice);
free(data_cpu);
}
fclose(f_ptr);
return Tensor(where, type, shape, data);
}
size_t Tensor::size() const
{
if (data == nullptr || shape.size() == 0) {
return 0;
}
return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
}
size_t Tensor::sizeBytes() const
{
return size() * Tensor::getTypeSize(type);
}
std::string Tensor::whereToString() const
{
static const std::unordered_map<MemoryType, std::string> mem_to_string{
{MEMORY_CPU, "CPU"}, {MEMORY_CPU_PINNED, "CPU_PINNED"}, {MEMORY_GPU, "GPU"}};
return mem_to_string.at(where);
}
std::string Tensor::toString() const
{
std::string memtype_str = whereToString();
static const std::unordered_map<DataType, std::string> type_to_string{
{TYPE_BOOL, "BOOL"},
{TYPE_UINT8, "UINT8"},
{TYPE_UINT16, "UINT16"},
{TYPE_UINT32, "UINT32"},
{TYPE_UINT64, "UINT64"},
{TYPE_INT8, "INT8"},
{TYPE_INT16, "INT16"},
{TYPE_INT32, "INT32"},
{TYPE_INT64, "INT64"},
{TYPE_BF16, "BF16"},
{TYPE_FP16, "FP16"},
{TYPE_FP32, "FP32"},
{TYPE_FP64, "FP64"},
{TYPE_BYTES, "BYTES"},
{TYPE_INVALID, "INVALID"},
{TYPE_FP8_E4M3, "E4M3"},
{TYPE_VOID, "VOID"},
};
return fmtstr("Tensor[where=%s, type=%s, shape=%s, data=%p]",
memtype_str.c_str(),
type_to_string.at(type).c_str(),
vec2str(shape).c_str(),
data);
}
DataType Tensor::typeFromNumpyDesc(std::string type)
{
static const std::unordered_map<std::string, DataType> type_map{{"?", TYPE_BOOL},
{"b", TYPE_BYTES},
{"u1", TYPE_UINT8},
{"u2", TYPE_UINT16},
{"u4", TYPE_UINT32},
{"u8", TYPE_UINT64},
{"i1", TYPE_INT8},
{"i2", TYPE_INT16},
{"i4", TYPE_INT32},
{"i8", TYPE_INT64},
{"f2", TYPE_FP16},
{"f4", TYPE_FP32},
{"f8", TYPE_FP64}};
return type_map.at(type);
}
size_t Tensor::getTypeSize(DataType type)
{
static const std::unordered_map<DataType, size_t> type_map{{TYPE_BOOL, sizeof(bool)},
{TYPE_BYTES, sizeof(char)},
{TYPE_UINT8, sizeof(uint8_t)},
{TYPE_UINT16, sizeof(uint16_t)},
{TYPE_UINT32, sizeof(uint32_t)},
{TYPE_UINT64, sizeof(uint64_t)},
{TYPE_INT8, sizeof(int8_t)},
{TYPE_INT16, sizeof(int16_t)},
{TYPE_INT32, sizeof(int32_t)},
{TYPE_INT64, sizeof(int64_t)},
#ifdef ENABLE_BF16
{TYPE_BF16, sizeof(__nv_bfloat16)},
#endif
#ifdef ENABLE_FP8
{TYPE_FP8_E4M3, sizeof(__nv_fp8_e4m3)},
#endif
{TYPE_FP16, sizeof(half)},
{TYPE_FP32, sizeof(float)},
{TYPE_FP64, sizeof(double)}};
return type_map.at(type);
}
std::string Tensor::getNumpyTypeDesc(DataType type) const
{
static const std::unordered_map<DataType, std::string> type_map{{TYPE_INVALID, "x"},
{TYPE_BOOL, "?"},
{TYPE_BYTES, "b"},
{TYPE_UINT8, "u1"},
{TYPE_UINT16, "u2"},
{TYPE_UINT32, "u4"},
{TYPE_UINT64, "u8"},
{TYPE_INT8, "i1"},
{TYPE_INT16, "i2"},
{TYPE_INT32, "i4"},
{TYPE_INT64, "i8"},
{TYPE_FP16, "f2"},
{TYPE_FP32, "f4"},
{TYPE_FP64, "f8"}};
if (type == TYPE_BF16) {
FT_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
"support bfloat16 as of now, it will be properly extended if numpy supports. "
"Please refer for the discussions https://github.com/numpy/numpy/issues/19808.");
}
return type_map.count(type) > 0 ? type_map.at(type) : "x";
}
void Tensor::saveNpy(const std::string& filename) const
{
// Save tensor to NPY 1.0 format (see https://numpy.org/neps/nep-0001-npy-format.html)
void* cpu_data = (void*)data;
bool is_data_temp = false;
size_t tensor_size = size();
if (where == MemoryType::MEMORY_GPU) {
cpu_data = malloc(tensor_size * Tensor::getTypeSize(type));
is_data_temp = true;
cudaDeviceSynchronize();
cudaMemcpy(cpu_data, data, tensor_size * Tensor::getTypeSize(type), cudaMemcpyDeviceToHost);
}
const char magic[] = "\x93"
"NUMPY";
const uint8_t npy_major = 1;
const uint8_t npy_minor = 0;
std::stringstream header_stream;
header_stream << "{'descr': '" << getNumpyTypeDesc(type) << "', 'fortran_order': False, 'shape': (";
for (size_t i = 0; i < shape.size(); ++i) {
header_stream << shape[i];
if (i + 1 < shape.size() || shape.size() == 1) {
header_stream << ", ";
}
}
header_stream << ")}";
int base_length = 6 + 4 + header_stream.str().size();
int pad_length = 16 * ((base_length + 1 + 15) / 16); // Take ceiling of base_length + 1 (for '\n' ending)
for (int i = 0; i < pad_length - base_length; ++i) {
header_stream << ((i == pad_length - base_length - 1) ? "\n" : "\x20");
}
std::string header = header_stream.str();
const uint16_t header_len = header.size();
FILE* f_ptr = fopen(filename.c_str(), "wb");
FT_CHECK_WITH_INFO(f_ptr != nullptr, fmtstr("Unable to open %s for writing.\n", filename.c_str()));
fwrite(magic, sizeof(char), sizeof(magic) - 1, f_ptr);
fwrite(&npy_major, sizeof(uint8_t), 1, f_ptr);
fwrite(&npy_minor, sizeof(uint8_t), 1, f_ptr);
fwrite(&header_len, sizeof(uint16_t), 1, f_ptr);
fwrite(header.c_str(), sizeof(char), header_len, f_ptr);
fwrite(cpu_data, Tensor::getTypeSize(type), tensor_size, f_ptr);
fclose(f_ptr);
if (is_data_temp) {
free(cpu_data);
}
}
Tensor Tensor::slice(std::vector<size_t> shape, size_t offset) const
{
if (this->data != nullptr) {
size_t n_elts = this->size();
size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
FT_CHECK_WITH_INFO(
n_sliced_elts + offset <= n_elts,
fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor",
n_sliced_elts + offset,
n_elts));
}
return Tensor(this->where, this->type, shape, this->getPtrWithOffset(offset));
}
TensorMap::TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map)
{
for (auto& kv : tensor_map) {
if (isValid(kv.second)) {
insert(kv.first, kv.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
}
}
}
TensorMap::TensorMap(const std::vector<Tensor>& tensor_map)
{
for (size_t i = 0; i < tensor_map.size(); i++) {
insert(std::to_string(i), tensor_map[i]);
}
}
TensorMap::TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tensor_map)
{
for (auto& pair : tensor_map) {
if (isValid(pair.second)) {
insert(pair.first, pair.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
}
}
}
TensorMap::~TensorMap()
{
tensor_map_.clear();
}
std::vector<std::string> TensorMap::keys() const
{
std::vector<std::string> key_names;
for (auto& kv : tensor_map_) {
key_names.push_back(kv.first);
}
return key_names;
}
std::string TensorMap::toString()
{
std::stringstream ss;
ss << "{";
std::vector<std::string> key_names = keys();
for (size_t i = 0; i < tensor_map_.size(); ++i) {
ss << key_names[i] << ": " << at(key_names[i]).toString();
if (i < tensor_map_.size() - 1) {
ss << ", ";
}
}
ss << "}";
return ss.str();
}
TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
{
DIR* dir_p = opendir(base_folder.c_str());
FT_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. ", base_folder.c_str()));
struct dirent* dp;
TensorMap ret_tensor;
while ((dp = readdir(dir_p)) != nullptr) {
std::string filename(dp->d_name);
size_t len = filename.length();
if (len < 4 || filename.compare(len - 4, 4, ".npy")) {
continue;
}
size_t pos = filename.find('-');
FT_CHECK_WITH_INFO(pos != std::string::npos, fmtstr("Invalid filename: %s\n", filename.c_str()));
MemoryType where;
if (filename.compare(0, pos, "GPU") == 0) {
where = MEMORY_GPU;
}
else if (filename.compare(0, pos, "CPU") == 0) {
where = MEMORY_CPU;
}
else if (filename.compare(0, pos, "CPU_PINNED") == 0) {
where = MEMORY_CPU_PINNED;
}
else {
FT_CHECK_WITH_INFO(false, fmtstr("Invalid filename: %s\n", filename.c_str()));
}
std::string key = filename.substr(pos + 1, len - pos - 5);
ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)});
}
closedir(dir_p);
return ret_tensor;
}
void TensorMap::saveNpy(const std::string& base_folder)
{
mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
int ret = mkdir(base_folder.c_str(), mode_0755);
FT_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
for (const auto& item : tensor_map_) {
item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy");
}
}
} // namespace fastertransformer
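// --- Hedged usage sketch (not part of the original file) ---------------------
// Round-tripping a small FP32 tensor through the NPY helpers implemented above.
// The file path is illustrative; loadNpy mallocs the host buffer, so the caller
// owns it.
static void npy_round_trip_example()
{
    using namespace fastertransformer;
    std::vector<float> host(6, 1.0f);
    Tensor t{MEMORY_CPU, TYPE_FP32, {2, 3}, host.data()};
    t.saveNpy("/tmp/ft_example.npy");

    Tensor back = Tensor::loadNpy("/tmp/ft_example.npy", MEMORY_CPU);
    float first = back.getVal<float>(0);  // 1.0f
    (void)first;
    free(const_cast<void*>(back.data));   // buffer allocated by loadNpy
}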
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <dirent.h>
#include <numeric>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
typedef enum datatype_enum
{
TYPE_INVALID,
TYPE_BOOL,
TYPE_UINT8,
TYPE_UINT16,
TYPE_UINT32,
TYPE_UINT64,
TYPE_INT8,
TYPE_INT16,
TYPE_INT32,
TYPE_INT64,
TYPE_FP16,
TYPE_FP32,
TYPE_FP64,
TYPE_BYTES,
TYPE_BF16,
TYPE_FP8_E4M3,
TYPE_STR,
TYPE_VOID,
} DataType;
template<typename T>
DataType getTensorType()
{
if (std::is_same<T, float>::value || std::is_same<T, const float>::value) {
return TYPE_FP32;
}
else if (std::is_same<T, half>::value || std::is_same<T, const half>::value) {
return TYPE_FP16;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value || std::is_same<T, const __nv_bfloat16>::value) {
return TYPE_BF16;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value || std::is_same<T, const __nv_fp8_e4m3>::value) {
return TYPE_FP8_E4M3;
}
#endif
else if (std::is_same<T, int>::value || std::is_same<T, const int>::value) {
return TYPE_INT32;
}
else if (std::is_same<T, int8_t>::value || std::is_same<T, const int8_t>::value) {
return TYPE_INT8;
}
else if (std::is_same<T, uint>::value || std::is_same<T, const uint>::value) {
return TYPE_UINT32;
}
else if (std::is_same<T, unsigned long long int>::value || std::is_same<T, const unsigned long long int>::value) {
return TYPE_UINT64;
}
else if (std::is_same<T, bool>::value || std::is_same<T, const bool>::value) {
return TYPE_BOOL;
}
else if (std::is_same<T, char>::value || std::is_same<T, const char>::value) {
return TYPE_BYTES;
}
else {
return TYPE_INVALID;
}
}
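// --- Illustrative check of the mapping above (not part of the original header) --
inline bool tensor_type_mapping_example()
{
    return getTensorType<float>() == TYPE_FP32 && getTensorType<half>() == TYPE_FP16
           && getTensorType<int>() == TYPE_INT32 && getTensorType<char>() == TYPE_BYTES
           && getTensorType<void*>() == TYPE_INVALID;  // unlisted types map to TYPE_INVALID
}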
typedef enum memorytype_enum
{
MEMORY_CPU,
MEMORY_CPU_PINNED,
MEMORY_GPU
} MemoryType;
struct Tensor {
const MemoryType where;
const DataType type;
const std::vector<size_t> shape;
const void* data; // TODO(bhseuh) modify from const void* to void* const
const std::vector<size_t> offsets = std::vector<size_t>{};
Tensor();
Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data);
Tensor(const MemoryType _where,
const DataType _type,
const std::vector<size_t> _shape,
const void* _data,
const std::vector<size_t> _offset);
size_t size() const;
size_t sizeBytes() const;
std::string whereToString() const;
std::string toString() const;
std::string getNumpyTypeDesc(DataType type) const;
void saveNpy(const std::string& filename) const;
static Tensor loadNpy(const std::string& npy_file, const MemoryType where);
static DataType typeFromNumpyDesc(std::string type);
static size_t getTypeSize(DataType type);
template<typename T>
inline T getVal(size_t index) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(where == MEMORY_CPU);
FT_CHECK(data != nullptr);
FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size");
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return ((T*)data)[index];
}
template<typename T>
inline T getVal() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return getVal<T>(0);
}
template<typename T>
inline T* getPtr() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getPtr with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return (T*)data;
}
inline void* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (data == nullptr) {
return (void*)data;
}
else {
FT_CHECK_WITH_INFO(offset < size(), "offset is larger than buffer size");
return (void*)((char*)data + offset * Tensor::getTypeSize(type));
}
}
template<typename T>
inline T* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
if (data == nullptr) {
return (T*)data;
}
else {
FT_CHECK_WITH_INFO(offset < size(),
fmtstr("offset (%lu) is larger than buffer size (%lu)", offset, size()));
return ((T*)data) + offset;
}
}
template<typename T>
T max() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"max() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
size_t max_idx = 0;
T max_val = getVal<T>(max_idx);
for (size_t i = 1; i < size(); ++i) {
T val = getVal<T>(i);
if (val > max_val) {
max_idx = i;
max_val = val;
}
}
return max_val;
}
template<typename T>
T min() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"min() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
size_t min_idx = 0;
T min_val = getVal<T>(min_idx);
for (size_t i = 1; i < size(); ++i) {
T val = getVal<T>(i);
if (val < min_val) {
min_idx = i;
min_val = val;
}
}
return min_val;
}
template<typename T>
T any(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"any() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
for (size_t i = 0; i < size(); ++i) {
if (getVal<T>(i) == val) {
return true;
}
}
return false;
}
template<typename T>
T all(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"all() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
for (size_t i = 0; i < size(); ++i) {
if (getVal<T>(i) != val) {
return false;
}
}
return true;
}
void updateShape(size_t idx, size_t val)
{
// TODO: find a better way to update the shape
std::vector<size_t>& shape_ref = const_cast<std::vector<size_t>&>(shape);
shape_ref[idx] = val;
}
Tensor slice(std::vector<size_t> shape, size_t offset = 0) const;
private:
static void parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data);
static int parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector<size_t>& shape);
};
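// --- Hedged usage sketch (not part of the original header) -------------------
// Tensor::slice returns a view that aliases the parent's storage (no copy).
// Here: rows 1..2 of a 4 x 8 FP32 CPU tensor; the shape is an illustrative assumption.
inline Tensor slice_rows_example(const Tensor& logits /* shape {4, 8}, MEMORY_CPU */)
{
    Tensor rows_1_2 = logits.slice({2, 8}, /*offset=*/1 * 8);  // elements [8, 24)
    float first = rows_1_2.getVal<float>(0);                   // parent element [1][0]
    (void)first;
    return rows_1_2;
}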
class TensorMap {
private:
std::unordered_map<std::string, Tensor> tensor_map_;
inline bool isValid(const Tensor& tensor)
{
return tensor.size() > 0 && tensor.data != nullptr;
}
public:
TensorMap() = default;
TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map);
TensorMap(const std::vector<Tensor>& tensor_map);
TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tensor_map);
~TensorMap();
inline size_t size() const
{
return tensor_map_.size();
}
inline bool isExist(const std::string& key) const
{
FT_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
return tensor_map_.find(key) != tensor_map_.end();
}
std::vector<std::string> keys() const;
inline void insert(const std::string& key, const Tensor& value)
{
FT_CHECK_WITH_INFO(!isExist(key), fmtstr("Duplicated key %s", key.c_str()));
FT_CHECK_WITH_INFO(isValid(value), fmtstr("A none tensor or nullptr is not allowed (key is %s)", key.c_str()));
tensor_map_.insert({key, value});
}
inline void insertIfValid(const std::string& key, const Tensor& value)
{
if (isValid(value)) {
insert({key, value});
}
}
inline void insert(std::pair<std::string, Tensor> p)
{
tensor_map_.insert(p);
}
// prevent converting int or size_t to string automatically
Tensor at(int tmp) = delete;
Tensor at(size_t tmp) = delete;
inline Tensor& at(const std::string& key)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key);
}
inline Tensor at(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key);
}
inline Tensor& at(const std::string& key, Tensor& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor at(const std::string& key, Tensor& default_tensor) const
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor& at(const std::string& key, Tensor&& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor at(const std::string& key, Tensor&& default_tensor) const
{
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
template<typename T>
inline T getVal(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getVal<T>();
}
template<typename T>
inline T getVal(const std::string& key, T default_value) const
{
if (isExist(key)) {
return tensor_map_.at(key).getVal<T>();
}
return default_value;
}
template<typename T>
inline T getValWithOffset(const std::string& key, size_t index) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getVal<T>(index);
}
template<typename T>
inline T getValWithOffset(const std::string& key, size_t index, T default_value) const
{
if (isExist(key)) {
return tensor_map_.at(key).getVal<T>(index);
}
return default_value;
}
template<typename T>
inline T* getPtr(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getPtr<T>();
}
template<typename T>
inline T* getPtr(const std::string& key, T* default_ptr) const
{
if (isExist(key)) {
return tensor_map_.at(key).getPtr<T>();
}
return default_ptr;
}
template<typename T>
inline T* getPtrWithOffset(const std::string& key, size_t index) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getPtrWithOffset<T>(index);
}
template<typename T>
inline T* getPtrWithOffset(const std::string& key, size_t index, T* default_ptr) const
{
if (isExist(key)) {
return tensor_map_.at(key).getPtrWithOffset<T>(index);
}
return default_ptr;
}
inline std::unordered_map<std::string, Tensor> getMap() const
{
return tensor_map_;
}
inline std::unordered_map<std::string, Tensor>::iterator begin()
{
return tensor_map_.begin();
}
inline std::unordered_map<std::string, Tensor>::iterator end()
{
return tensor_map_.end();
}
inline std::unordered_map<std::string, Tensor>& get()
{
return tensor_map_;
}
inline std::unordered_map<std::string, Tensor>::const_iterator begin() const
{
return tensor_map_.begin();
}
inline std::unordered_map<std::string, Tensor>::const_iterator end() const
{
return tensor_map_.end();
}
std::string toString();
static TensorMap fromNpyFolder(const std::string& base_folder);
void saveNpy(const std::string& base_folder);
};
} // namespace fastertransformer
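// --- Hedged usage sketch (not part of the original header) -------------------
// Building a TensorMap from an initializer list and reading values back; the
// keys "step", "input_ids" and "beam_width" are illustrative, not a schema.
inline void tensor_map_example()
{
    using namespace fastertransformer;
    int step = 7;
    std::vector<int> ids{1, 2, 3, 4};
    TensorMap args{{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
                   {"input_ids", Tensor{MEMORY_CPU, TYPE_INT32, {1, 4}, ids.data()}}};

    int  s      = args.getVal<int>("step");           // 7
    int  beam   = args.getVal<int>("beam_width", 1);  // key missing -> default 1
    int* id_ptr = args.getPtr<int>("input_ids");      // raw pointer into `ids`
    (void)s; (void)beam; (void)id_ptr;
}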
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
enum class ActivationType {
Gelu,
Relu,
Silu,
GeGLU,
ReGLU,
SiGLU,
Identity,
InvalidType
};
inline ActivationType getActivationType(std::string activation_type_str)
{
if (activation_type_str == "Gelu" || activation_type_str == "gelu") {
return ActivationType::Gelu;
}
else if (activation_type_str == "Relu" || activation_type_str == "relu") {
return ActivationType::Relu;
}
else if (activation_type_str == "Silu" || activation_type_str == "silu") {
return ActivationType::Silu;
}
else if (activation_type_str == "GeGLU" || activation_type_str == "geglu" || activation_type_str == "gated-gelu") {
return ActivationType::GeGLU;
}
else if (activation_type_str == "ReGLU" || activation_type_str == "reglu" || activation_type_str == "gated-relu") {
return ActivationType::ReGLU;
}
else if (activation_type_str == "SiGLU" || activation_type_str == "gated-silu") {
return ActivationType::SiGLU;
}
else {
FT_CHECK_WITH_INFO(false, "Activation Type: " + activation_type_str + " not supported !");
}
return ActivationType::InvalidType;
}
inline bool isGatedActivation(ActivationType activation_type)
{
return activation_type == ActivationType::GeGLU || activation_type == ActivationType::ReGLU
|| activation_type == ActivationType::SiGLU;
}
} // namespace fastertransformer
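// --- Illustrative use of the helpers above (not part of the original header) --
// LLaMA-style FFNs use a gated SiLU, so two projection weights are expected.
inline bool gated_silu_example()
{
    using namespace fastertransformer;
    ActivationType act = getActivationType("gated-silu");
    return isGatedActivation(act);  // true
}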
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Memory Allocator
**/
#pragma once
#include "cuda_utils.h"
#include <cuda_runtime.h>
#include <unordered_map>
#include <vector>
#ifdef GOOGLE_CUDA
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#endif
#ifdef TORCH_CUDA
#include "torch/extension.h"
#include <memory>
#endif
#include "src/fastertransformer/utils/logger.h"
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
#define CUDA_MEMORY_POOL_DISABLED
#endif
namespace fastertransformer {
enum class AllocatorType {
CUDA,
TF,
TH
};
enum class ReallocType {
INCREASE,
REUSE,
DECREASE,
};
class IAllocator {
public:
virtual ~IAllocator(){};
virtual void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0;
virtual void free(void** ptr, bool is_host = false) const = 0;
virtual void setStream(cudaStream_t stream) = 0;
virtual cudaStream_t returnStream() = 0;
virtual void memSet(void* ptr, const int val, const size_t size) = 0;
template<typename T>
void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
size = ((size + 31) / 32) * 32; // round the size up to a multiple of 32 bytes
void* void_ptr = (void*)ptr;
void* ptr_address = getAddress(void_ptr);
if (isExist(ptr_address)) {
ReallocType realloc_type = isReMalloc(ptr_address, size);
if (realloc_type == ReallocType::INCREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#if !defined(CUDA_MEMORY_POOL_DISABLED)
else if (realloc_type == ReallocType::DECREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#endif
else {
FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
if (is_set_zero) {
memSet(void_ptr, 0, size);
}
return void_ptr;
}
}
else {
FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
return malloc(size, is_set_zero, is_host);
}
}
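// Illustrative usage (a sketch, not part of the original source): reMalloc tracks every
// buffer it hands out and only reallocates when the requested size grows (or, when CUDA
// memory pools are available, when it shrinks); an equally sized request reuses the
// original pointer.
//
//   float* buf = nullptr;
//   buf = (float*)allocator->reMalloc(buf, 1024);  // unknown pointer: plain malloc
//   buf = (float*)allocator->reMalloc(buf, 1024);  // same size: pointer is reused
//   buf = (float*)allocator->reMalloc(buf, 4096);  // larger: freed and reallocated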
protected:
virtual bool isExist(void* address) const = 0;
virtual ReallocType isReMalloc(void* address, size_t size) const = 0;
void* getAddress(void* ptr) const
{
return ptr;
}
};
template<AllocatorType AllocType_>
class Allocator;
template<>
class Allocator<AllocatorType::CUDA>: public IAllocator {
private:
const int device_id_;
cudaStream_t stream_ = 0; // initialize as default stream
std::unordered_map<void*, size_t>* pointer_mapping_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
if (pointer_mapping_->at(address) < size) {
return ReallocType::INCREASE;
}
else if (pointer_mapping_->at(address) == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator(int device_id): device_id_(device_id)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
pointer_mapping_ = new std::unordered_map<void*, size_t>();
#if defined(CUDA_MEMORY_POOL_DISABLED)
FT_LOG_WARNING(
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
#else
int device_count = 1;
check_cuda_error(cudaGetDeviceCount(&device_count));
cudaMemPool_t mempool;
check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id));
cudaMemAccessDesc desc = {};
int peer_access_available = 0;
for (int i = 0; i < device_count; i++) {
if (i == device_id) {
continue;
}
check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
if (!peer_access_available) {
FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+ " is not available.");
continue;
}
desc.location.type = cudaMemLocationTypeDevice;
desc.location.id = i;
desc.flags = cudaMemAccessFlagsProtReadWrite;
check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1));
}
// set memory pool threshold to avoid shrinking the pool
uint64_t setVal = UINT64_MAX;
check_cuda_error(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &setVal));
#endif
}
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
free((void**)(&pointer_mapping_->begin()->first));
}
delete pointer_mapping_;
}
void setStream(cudaStream_t stream)
{
stream_ = stream;
}
cudaStream_t returnStream()
{
return stream_;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (size == 0) {
return nullptr;
}
void* ptr = nullptr;
int o_device = 0;
check_cuda_error(getSetDevice(device_id_, &o_device));
if (is_host) {
check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32));
}
else {
#if defined(CUDA_MEMORY_POOL_DISABLED)
check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32));
#else
check_cuda_error(cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_));
#endif
}
if (is_set_zero) {
check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
}
check_cuda_error(getSetDevice(o_device));
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
pointer_mapping_->insert({getAddress(ptr), size});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
if (*ptr != nullptr) {
int o_device = 0;
if (pointer_mapping_->count(address)) {
FT_LOG_DEBUG("Free buffer %p", address);
check_cuda_error(getSetDevice(device_id_, &o_device));
if (is_host) {
check_cuda_error(cudaFreeHost(*ptr));
}
else {
#if defined(CUDA_MEMORY_POOL_DISABLED)
check_cuda_error(cudaFree(*ptr));
#else
check_cuda_error(cudaFreeAsync(*ptr, stream_));
cudaStreamSynchronize(stream_);
#endif
}
check_cuda_error(getSetDevice(o_device));
pointer_mapping_->erase(address);
}
else {
FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
}
}
*ptr = nullptr;
return;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
}
};
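// Illustrative usage (a sketch, not part of the original source): the CUDA allocator is
// bound to one device; with CUDA >= 11.2 allocations are stream-ordered via the device's
// default memory pool, otherwise plain cudaMalloc/cudaFree are used.
//
//   Allocator<AllocatorType::CUDA> allocator(/*device_id=*/0);
//   allocator.setStream(stream);
//   void* d_buf = allocator.malloc(1 << 20);  // zero-initialized by default
//   allocator.free(&d_buf);                   // releases the buffer and resets d_buf to nullptr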
#ifdef GOOGLE_CUDA
using namespace tensorflow;
template<>
class Allocator<AllocatorType::TF>: public IAllocator {
OpKernelContext* context_;
std::unordered_map<void*, tensorflow::Tensor>* pointer_mapping_;
cudaStream_t stream_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
size_t current_buffer_size = 1;
for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) {
current_buffer_size *= pointer_mapping_->at(address).dim_size(i);
}
FT_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
}
else if (current_buffer_size == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator(OpKernelContext* context, cudaStream_t stream): context_(context), stream_(stream)
{
pointer_mapping_ = new std::unordered_map<void*, tensorflow::Tensor>();
}
void setStream(cudaStream_t stream)
{
stream_ = stream;
}
cudaStream_t returnStream()
{
return stream_;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
tensorflow::Tensor buf;
long long int buf_size = ((long long int)ceil(size / 32.) * 32);
tensorflow::Status status;
if (is_host) {
tensorflow::AllocatorAttributes pinned_allocator;
pinned_allocator.set_on_host(true);
pinned_allocator.set_gpu_compatible(true);
status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf, pinned_allocator);
}
else {
status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf);
}
if (status != tensorflow::Status::OK()) {
throw std::runtime_error("TF error: context->allocate_temp failed");
}
auto flat = buf.flat<uint8>();
void* ptr = (void*)flat.data();
if (is_set_zero) {
cudaMemsetAsync(ptr, 0, buf_size, stream_);
}
pointer_mapping_->insert({getAddress(ptr), buf});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
return;
}
virtual ~Allocator()
{
while (!pointer_mapping_->empty()) {
void* ptr = pointer_mapping_->begin()->second.flat<uint8>().data();
free((void**)(&ptr));
}
pointer_mapping_->clear();
delete pointer_mapping_;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
}
};
#endif
#ifdef TORCH_CUDA
template<>
class Allocator<AllocatorType::TH>: public IAllocator {
std::unordered_map<void*, torch::Tensor>* pointer_mapping_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
size_t current_buffer_size = 1;
for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) {
current_buffer_size *= pointer_mapping_->at(address).size(i);
}
FT_LOG_DEBUG(
"current_buffer_size: %ld, original buffer: %p, new buffer size: %ld", current_buffer_size, address, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
}
else if (current_buffer_size == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator()
{
pointer_mapping_ = new std::unordered_map<void*, torch::Tensor>();
}
void setStream(cudaStream_t stream)
{
// nothing to do here;
}
cudaStream_t returnStream()
{
// nothing to do here;
return 0;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
int64_t buf_size = static_cast<int64_t>(ceil(size / 32.)) * 32;
torch::Tensor buf;
if (is_host) {
buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true));
}
else {
buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCUDA));
}
void* ptr = buf.data_ptr();
if (is_set_zero) {
cudaMemset(ptr, 0, buf_size);
}
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
pointer_mapping_->insert({getAddress(ptr), buf});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
return;
}
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
void* ptr = pointer_mapping_->begin()->second.data_ptr();
free((void**)(&ptr));
}
pointer_mapping_->clear();
delete pointer_mapping_;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemset(ptr, val, size));
}
};
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasLt.h"
#include "cuda_utils.h"
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cudnn.h>
namespace fastertransformer {
template<typename T>
void conv2d(T* output,
const T* input,
const T* kernel,
const int batch,
const int h,
const int w,
const int in_channels,
const int out_channels,
const int kernel_size,
const int stride,
cudnnHandle_t& cudnn_handle)
{
cudnnDataType_t dataType;
cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
float alpha = 1.0f;
float beta = 0.0f;
if (std::is_same<T, half>::value) {
dataType = CUDNN_DATA_HALF;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
dataType = CUDNN_DATA_BFLOAT16;
}
#endif
else {
dataType = CUDNN_DATA_FLOAT;
}
cudnnTensorDescriptor_t input_descriptor_;
cudnnTensorDescriptor_t output_descriptor_;
cudnnFilterDescriptor_t kernel_descriptor_;
cudnnConvolutionDescriptor_t convolution_descriptor_;
cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_DIRECT;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor_));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor_,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/dataType,
/*batch_size=*/batch,
/*channels=*/in_channels,
/*image_height=*/h,
/*image_width=*/w));
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor_));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor_,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/dataType,
/*batch_size=*/batch,
/*channels=*/out_channels,
/*image_height=*/h / stride,
/*image_width=*/w / stride));
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor_));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor_,
/*dataType=*/dataType,
/*format=*/CUDNN_TENSOR_NCHW,
/*out_channels=*/out_channels,
/*in_channels=*/in_channels,
/*kernel_height=*/kernel_size,
/*kernel_width=*/kernel_size));
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor_));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor_,
/*pad_height=*/0,
/*pad_width=*/0,
/*vertical_stride=*/stride,
/*horizontal_stride=*/stride,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*//*CUDNN_CONVOLUTION,*/ CUDNN_CROSS_CORRELATION,
/*computeType=*/computeType));
/*checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnn_handle,
input_descriptor_,
kernel_descriptor_,
convolution_descriptor_,
output_descriptor_,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0,//memoryLimitInBytes
&convolution_algorithm_));*/
checkCUDNN(cudnnConvolutionForward(cudnn_handle,
&alpha,
input_descriptor_,
input,
kernel_descriptor_,
kernel,
convolution_descriptor_,
convolution_algorithm_,
nullptr,
0,
&beta,
output_descriptor_,
output));
checkCUDNN(cudnnDestroyTensorDescriptor(input_descriptor_));
checkCUDNN(cudnnDestroyTensorDescriptor(output_descriptor_));
checkCUDNN(cudnnDestroyFilterDescriptor(kernel_descriptor_));
checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
}
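// Illustrative usage (a sketch, not part of the original source; the cuDNN handle is
// assumed to be created elsewhere): the input is described as NCHW, the output as NHWC,
// there is no padding, and the same stride is used in both dimensions, so the output
// spatial size is (h / stride, w / stride).
//
//   conv2d<half>(d_out, d_in, d_kernel,
//                /*batch=*/8, /*h=*/224, /*w=*/224,
//                /*in_channels=*/3, /*out_channels=*/64,
//                /*kernel_size=*/16, /*stride=*/16, cudnn_handle);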
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "stdio.h"
#include "stdlib.h"
// keep the host-side conversion consistent with FasterTransformer's rounding
int8_t float_to_int8_rn_host(float x)
{
int8_t res;
int32_t tmp;
if (x >= 0) {
tmp = int(x + 0.5);
tmp = tmp > 127 ? 127 : tmp;
res = int8_t(tmp);
}
else {
tmp = int(x - 0.5);
tmp = tmp < -127 ? -127 : tmp;
res = int8_t(tmp);
}
return res;
}
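// Illustrative behaviour (derived from the function above): values are rounded to the
// nearest integer and saturated to the symmetric range [-127, 127].
//
//   float_to_int8_rn_host(3.4f);     // ->   3
//   float_to_int8_rn_host(3.6f);     // ->   4
//   float_to_int8_rn_host(200.0f);   // ->  127
//   float_to_int8_rn_host(-200.0f);  // -> -127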
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasAlgoMap.h"
namespace fastertransformer {
cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename):
config_filename_(filename), sp_config_filename_(sp_config_filename)
{
loadGemmConfig();
loadSpGemmConfig();
}
cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap& algo_map):
config_filename_(algo_map.config_filename_),
sp_config_filename_(algo_map.sp_config_filename_),
algo_map_(algo_map.algo_map_),
sp_algo_map_(algo_map.sp_algo_map_)
{
}
cublasAlgoMap::~cublasAlgoMap()
{
algo_map_.clear();
}
void cublasAlgoMap::loadGemmConfig()
{
FILE* fd;
fd = fopen(config_filename_.c_str(), "r");
if (fd == NULL) {
std::cout << "[WARNING] " << config_filename_ << " is not found; using default GEMM algo" << std::endl;
return;
}
int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val;
int batch_size, seq_len, head_num, size_per_head, dataType;
int swizzle, reductionScheme, workspaceSize, stages;
int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode;
float exec_time;
char tmp[1024];
if (!fgets(tmp, 1024, fd)) {
printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
exit(-1);
}
while (fscanf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"%d %d "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"%d %d %d "
#endif
"%f\n",
&batch_size,
&seq_len,
&head_num,
&size_per_head,
&dataType,
&batchCount2,
&n2,
&m2,
&k2,
&algoId,
&customOption,
&tile,
&splitK_val,
&swizzle,
&reductionScheme,
&workspaceSize,
&stages,
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
&inner_shapeId,
&cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
&mma_shapeId,
&cga_shapeId,
&sche_mode,
#endif
&exec_time)
!= EOF) {
if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE && dataType != BFLOAT16_DATATYPE
&& dataType != INT8_DATATYPE && dataType != FP8_DATATYPE) {
printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType);
continue;
}
cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2, static_cast<CublasDataType>(dataType)};
// workspaceSize should be zero
if (algo_map_.find(markStr) == algo_map_.end()) {
algo_map_[markStr].algoId = algoId;
algo_map_[markStr].customOption = customOption;
algo_map_[markStr].tile = tile;
algo_map_[markStr].splitK_val = splitK_val;
algo_map_[markStr].swizzle = swizzle;
algo_map_[markStr].reductionScheme = reductionScheme;
algo_map_[markStr].workspaceSize = workspaceSize;
algo_map_[markStr].stages = stages;
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId;
algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId;
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
algo_map_[markStr].mma_shapeId = (uint16_t)mma_shapeId;
algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId;
algo_map_[markStr].sche_mode = (uint16_t)sche_mode;
#endif
algo_map_[markStr].exec_time = exec_time;
}
}
fclose(fd);
}
bool cublasAlgoMap::isExist(
const int batch_count, const int m, const int n, const int k, const CublasDataType data_type)
{
cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
return algo_map_.find(mark) != algo_map_.end();
}
cublasLtMatmulAlgo_info
cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type)
{
cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
if (algo_map_.find(mark) != algo_map_.end()) {
return algo_map_[mark];
}
else {
cublasLtMatmulAlgo_info tmp_algo;
tmp_algo.algoId =
static_cast<int>(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
tmp_algo.customOption = -1;
tmp_algo.tile = -1;
tmp_algo.splitK_val = -1;
tmp_algo.swizzle = -1;
tmp_algo.reductionScheme = -1;
tmp_algo.workspaceSize = -1;
tmp_algo.stages = -1;
tmp_algo.exec_time = -1.0f;
return tmp_algo;
}
}
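// Note (illustrative, derived from the code above): when no profiled entry exists for a
// (batch_count, m, n, k, data_type) tuple, getAlgo() returns a record whose fields are
// all -1 except algoId, which falls back to the cuBLAS default algorithm. Callers such as
// cublasFP8MMWrapper treat stages == -1 as "no tuned algo found".
//
//   cublasLtMatmulAlgo_info info = algo_map.getAlgo(1, m, n, k, HALF_DATATYPE);
//   bool has_tuned_algo = algo_map.isExist(1, m, n, k, HALF_DATATYPE) && info.stages != -1;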
void cublasAlgoMap::loadSpGemmConfig()
{
if (sp_config_filename_.empty()) {
return;
}
FILE* fd = fopen(sp_config_filename_.c_str(), "r");
if (fd == NULL) {
printf("[WARNING] %s is not found; using SPGEMM algo id 0\n", sp_config_filename_.c_str());
return;
}
sp_algo_map_.clear();
int batch_size, seq_len, head_num, size_per_head, data_type;
int batchCount, m, n, k, algoId;
float exec_time;
char tmp[1024];
if (!fgets(tmp, 1024, fd)) {
printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
exit(-1);
}
while (fscanf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %f\n",
&batch_size,
&seq_len,
&head_num,
&size_per_head,
&data_type,
&batchCount,
&m,
&n,
&k,
&algoId,
&exec_time)
!= EOF) {
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k);
std::string markStr(mark);
sp_algo_map_[markStr] = algoId;
}
fclose(fd);
}
int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n, const int k)
{
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
return sp_algo_map_[mark];
}
else {
// no profiled entry (e.g. the remove-padding case); fall back to algo 0 for simplicity
return 0;
}
}
bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n, const int k)
{
// cusparseLt cannot be used unless m, n and k are all multiples of 8.
if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) {
return false;
}
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
return sp_algo_map_[mark] != -1;
}
else {
// no profiled GEMM case; default to using sparse GEMM
return true;
}
}
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#pragma once
namespace fastertransformer {
#define GEMM_NUM 6
#define GEMM_CONFIG "gemm_config.in"
#define IGEMM_CONFIG "igemm_config.in"
#define SPGEMM_CONFIG "spgemm_config.in"
#define SPIGEMM_CONFIG "spigemm_config.in"
typedef struct {
int algoId, customOption, tile, splitK_val;
int swizzle, reductionScheme, workspaceSize;
// only used in cublasLt >= 11.0
int stages;
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
uint16_t inner_shapeId, cluster_shapeId;
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
uint16_t mma_shapeId, cga_shapeId, sche_mode;
#endif
float exec_time;
} cublasLtMatmulAlgo_info;
/* Structure to store information about different run trials */
typedef struct {
cublasLtMatmulAlgo_t algo;
cublasStatus_t status;
float time;
size_t workspaceSize; // actual memory workspace needed
cublasMath_t mathMode;
cublasLtReductionScheme_t reductionScheme;
int customOption;
float wavesCount;
} customMatmulPerf_t;
struct cublasAlgoConfig_t {
int batch_count;
int m;
int n;
int k;
CublasDataType data_type;
bool operator==(cublasAlgoConfig_t const& config) const
{
return (batch_count == config.batch_count) && (m == config.m) && (n == config.n) && (k == config.k)
&& (data_type == config.data_type);
}
};
class cublasAlgoConfig_hasher {
public:
std::size_t operator()(cublasAlgoConfig_t const& config) const
{
return config.batch_count * 98317ull ^ config.m * 49157ull ^ config.n * 24593ull ^ config.k * 196613ull
^ static_cast<int>(config.data_type) * 6151ull;
}
};
class cublasAlgoMap {
private:
std::unordered_map<cublasAlgoConfig_t, cublasLtMatmulAlgo_info, cublasAlgoConfig_hasher> algo_map_;
std::string config_filename_;
std::string sp_config_filename_;
std::map<std::string, int> sp_algo_map_;
public:
cublasAlgoMap(){};
explicit cublasAlgoMap(const std::string filename, const std::string sp_config_filename = "");
cublasAlgoMap(const cublasAlgoMap& map);
~cublasAlgoMap();
void loadGemmConfig();
void loadSpGemmConfig();
int getSpAlgo(const int batch_count, const int m, const int n, const int k);
bool isUseSparse(const int batch_count, const int m, const int n, const int k);
bool isExist(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
cublasLtMatmulAlgo_info
getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
};
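// Illustrative usage (a sketch, not part of the original source): the map is usually
// constructed from a profiled GEMM config file; when the file is absent it logs a
// warning and falls back to cuBLAS default algorithms.
//
//   cublasAlgoMap algo_map(GEMM_CONFIG);  // "gemm_config.in"
//   if (algo_map.isExist(1, m, n, k, HALF_DATATYPE)) {
//       cublasLtMatmulAlgo_info info = algo_map.getAlgo(1, m, n, k, HALF_DATATYPE);
//   }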
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasFP8MMWrapper.h"
#include "cuda_utils.h"
namespace fastertransformer {
#define CUBLAS_WORKSPACE_1MB 1048576
cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
if (allocator_ != nullptr) {
cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, CUBLAS_WORKSPACE_1MB, true);
}
}
cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
if (allocator_ != nullptr) {
cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, CUBLAS_WORKSPACE_1MB, true);
}
}
cublasFP8MMWrapper::~cublasFP8MMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_qgemm_));
}
}
cublasFP8MMWrapper::cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper):
cublasMMWrapper(wrapper.cublas_handle_,
wrapper.cublaslt_handle_,
wrapper.stream_,
wrapper.cublas_algo_map_,
wrapper.mu_,
wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublasVersionCheck();
}
void cublasFP8MMWrapper::cublasVersionCheck()
{
cublasGetProperty(MAJOR_VERSION, &version_major_);
cublasGetProperty(MINOR_VERSION, &version_minor_);
cublasGetProperty(PATCH_LEVEL, &version_patch_);
size_t cublasVersion = (version_major_ * 10000 + version_minor_ * 100 + version_patch_);
#if defined(FP8_MHA) || !defined(FP8_GEMM_OUTPUT_QUANT_DISABLE)
FT_CHECK_WITH_INFO((version_major_ > 11) || (version_major_ == 11 && version_minor_ == 11 && version_patch_ >= 4),
"FP8 MHA needs d-scale, which is only supported after cublas 11.11.4 !");
#endif
}
void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale)
{
Gemm(res,
batchCount,
m,
n,
k,
strideA,
strideB,
strideD,
alpha,
beta,
input,
kernel,
input_scale,
kernel_scale,
(cudaStream_t)0);
}
void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto dType = CUDA_R_16BF;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0 && fastAccum) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
bool findAlgo = cublas_algo_map_->isExist(batchCount, n, m, k, FP8_DATATYPE);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batchCount, n, m, k, FP8_DATATYPE);
if (info.stages == -1) {
findAlgo = false;
}
cublasLtMatmulAlgo_t algo;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = false;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, aType, bType, dType, dType, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &(info.cluster_shapeId), sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
nullptr, // Cptr, not used here
Ddesc,
res,
Ddesc,
(findAlgo ? (&algo) : NULL),
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale)
{
Gemm(res,
batchCount,
m,
n,
k,
strideA,
strideB,
strideD,
alpha,
beta,
input,
kernel,
input_scale,
kernel_scale,
output_scale,
0);
}
void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
FT_CHECK(cublas_workspace_ != nullptr);
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto cType = CUDA_R_16BF;
const auto dType = CUDA_R_8F_E4M3;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Cdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0 && fastAccum) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
// check_cuda_error(cublasLtMatmulDescSetAttribute(
// matmulDesc, CUBLASLT_MATMUL_DESC_C_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, cType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
bool findAlgo = cublas_algo_map_->isExist(batchCount, n, m, k, FP8_DATATYPE);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batchCount, n, m, k, FP8_DATATYPE);
if (info.stages == -1) {
findAlgo = false;
}
cublasLtMatmulAlgo_t algo;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = false;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, aType, bType, cType, dType, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &(info.cluster_shapeId), sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
nullptr, // Cptr, not used here
Cdesc,
res,
Ddesc,
(findAlgo ? (&algo) : NULL),
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Cdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Conv1x1Gemm(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
size_t workspace_size = 0;
// get workspace size
qgmmaLauncher.getWorkSpaceSize<RELU, GELU>(n, workspace_size);
if (workspace_size > CUBLAS_WORKSPACE_1MB) {
throw std::runtime_error("Need to rellocate workspace for qgemm. It is not supported");
// cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, workspace_size);
}
qgmmaLauncher.invokeQgmma1x1<RELU, GELU>(
res, m, n, k, input, kernel, bias, input_scale, kernel_scale, output_scale, cublas_workspace_qgemm_, stream);
sync_check_cuda_error();
mu_->unlock();
}
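// Note (illustrative, derived from the code above): Conv1x1Gemm dispatches to the fused
// FP8 qgmma 1x1 kernel; the RELU/GELU template flags select the fused epilogue, and the
// kernel workspace must fit into the 1 MB buffer preallocated in the constructor,
// otherwise a std::runtime_error is thrown. A hypothetical call with a GELU epilogue:
//
//   fp8_wrapper->Conv1x1Gemm<false, true>(d_out_fp8, m, n, k,
//                                         d_input_fp8, d_kernel_fp8, d_bias_bf16,
//                                         input_scale, kernel_scale, output_scale, stream);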
template void cublasFP8MMWrapper::Conv1x1Gemm<true, false>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<true, true>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<false, false>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<false, true>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto dType = CUDA_R_16BF;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
if (RELU == true) {
epi = CUBLASLT_EPILOGUE_RELU_BIAS;
}
else if (GELU == true) {
epi = CUBLASLT_EPILOGUE_GELU_BIAS;
}
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
const int requestedAlgoCount = 1;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
int returnedAlgoCount = -1;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &wsSizeBytes, sizeof(wsSizeBytes)));
check_cuda_error(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle_,
matmulDesc,
Adesc,
Bdesc,
Ddesc,
Ddesc,
preference,
requestedAlgoCount,
&heuristicResult,
&returnedAlgoCount));
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
res,
Ddesc,
res,
Ddesc,
&heuristicResult.algo,
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto cType = CUDA_R_16BF;
const auto dType = CUDA_R_8F_E4M3;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Cdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_GELU_BIAS;
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, cType, n, m, ldd));
// TODO(Hongbinl): not sure whether this implementation makes sense
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
const int requestedAlgoCount = 1;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
int returnedAlgoCount = -1;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &wsSizeBytes, sizeof(wsSizeBytes)));
#if (CUBLAS_VERSION) <= 12000
uint32_t pointer_mode_mask = 0;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
#endif
check_cuda_error(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle_,
matmulDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
preference,
requestedAlgoCount,
&heuristicResult,
&returnedAlgoCount));
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
res,
Cdesc,
res,
Ddesc,
&heuristicResult.algo,
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, true>(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, true>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<true, false>(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<true, false>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, false>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "3rdparty/fp8_qgmma_1x1/fp8_qgmma_1x1_utils.h"
#include "cuda_utils.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasFP8MMWrapper: public cublasMMWrapper {
public:
cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
cublasFP8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
virtual ~cublasFP8MMWrapper();
cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper);
virtual void cublasVersionCheck() override;
void Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale);
void Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
cudaStream_t stream,
bool fastAccum = true);
void Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale);
void Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale,
cudaStream_t stream,
bool fastAccum = true);
template<bool RELU, bool GELU>
void Conv1x1Gemm(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void Gemm_Bias_Act(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void Gemm_Bias_Act(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
private:
int version_major_, version_minor_, version_patch_;
fastertransformer::qgmma1x1Launcher qgmmaLauncher;
void* cublas_workspace_qgemm_ = nullptr;
};
} // namespace fastertransformer
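/*
 * Illustrative usage sketch for the FP8 GEMM + bias + GELU path declared above.
 * The wrapper instance (fp8_wrapper), device buffers, scale pointers and the stream are
 * all assumed to be created elsewhere; the buffer shapes and whether alpha/beta live on
 * host or device follow the wrapper's convention and are assumptions here.
 *
 *   float alpha = 1.0f, beta = 0.0f;
 *   // non-batched GEMM: batchCount == 1, strides unused (0)
 *   fp8_wrapper.Gemm_Bias_Act<false, true>(d_out_bf16,        // [m, n] output
 *                                          1, m, n, k,
 *                                          0, 0, 0,
 *                                          &alpha, &beta,
 *                                          d_act_fp8,          // [m, k] activation
 *                                          d_weight_fp8,       // [n, k] weight
 *                                          d_input_scale,
 *                                          d_kernel_scale,
 *                                          d_bias_bf16,        // per-output-feature bias (assumed length n)
 *                                          d_output_scale,
 *                                          stream);
 */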
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasINT8MMWrapper.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
#ifdef SPARSITY_ENABLED
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(nullptr, cublaslt_handle, cusparselt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
#endif
cublasINT8MMWrapper::~cublasINT8MMWrapper()
{
mu_ = nullptr;
}
cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper):
#ifdef SPARSITY_ENABLED
cublasMMWrapper(nullptr,
wrapper.cublaslt_handle_,
wrapper.cusparselt_handle_,
wrapper.stream_,
wrapper.cublas_algo_map_,
wrapper.mu_,
wrapper.allocator_),
#else
cublasMMWrapper(
nullptr, wrapper.cublaslt_handle_, wrapper.stream_, wrapper.cublas_algo_map_, wrapper.mu_, wrapper.allocator_),
#endif
use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_)
{
}
// for int8 cublasLtMM with algo
// ATransform should be m*k, CUBLASLT_ORDER_COL32
// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or CUBLASLT_ORDER_COL32_2R_4R4
// res is m*n, CUBLASLT_ORDER_COL32
void cublasINT8MMWrapper::Gemm(int* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const int8_t* ATransform,
const int8_t* kernel)
{
mu_->lock();
cublasOperation_t opTranspose = CUBLAS_OP_T;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
cublasLtMatrixLayout_t CtransformDesc = NULL;
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4_) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
#else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform);
cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (batchCount > 1) {
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
}
int alphaI = 1;
int betaI = 0;
// get algo
cublasLtMatmulAlgo_t algo;
int findAlgo = 0;
if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
// printf("find algo %s\n", markStr.c_str());
findAlgo = 1;
cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
cublasLtMatmulAlgoInit(cublaslt_handle_,
computeType,
CUDA_R_32I,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_32I,
CUDA_R_32I,
tmp_info.algoId,
&algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
}
else {
findAlgo = 1;
int algoId;
if (use_ORDER_COL32_2R_4R4_) {
algoId = 7;
}
else {
algoId = 6;
}
int swizzle = 0;
int customOption = 0;
int tile = 20;
int splitK_val = 0;
int reductionScheme = 0;
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
}
cublasLtMatmul(cublaslt_handle_,
matmulDesc,
&alphaI,
ATransform,
AtransformDesc,
kernel,
BtransformDesc,
&betaI,
res,
CtransformDesc,
res,
CtransformDesc,
(findAlgo == 1 ? (&algo) : NULL),
NULL,
0,
stream_);
cublasLtMatmulDescDestroy(matmulDesc);
cublasLtMatrixLayoutDestroy(AtransformDesc);
cublasLtMatrixLayoutDestroy(BtransformDesc);
cublasLtMatrixLayoutDestroy(CtransformDesc);
sync_check_cuda_error();
mu_->unlock();
}
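/*
 * Worked example of the leading dimensions computed above, for m = 128, n = 100:
 *   ldaTransform = 32 * m                      = 4096
 *   ldcTransform = 32 * m                      = 4096
 *   ldbTransform = 32 * ((n + 31) / 32) * 32   = 32 * 4 * 32 = 4096   (CUBLASLT_ORDER_COL32_2R_4R4)
 *   ldbTransform = 32 * ((n + 7) / 8) * 8      = 32 * 13 * 8 = 3328   (CUBLASLT_ORDER_COL4_4R2_8C)
 * i.e. A and the int32 result stay in 32-column COL32 tiles, while the kernel is padded
 * to 32x32 or 32x8 tiles depending on use_ORDER_COL32_2R_4R4_.
 */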
// for int8 IO cublasLtMM with algo
// ATransform should be m*k CUBLASLT_ORDER_COL32
// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C
// res is m*n CUBLASLT_ORDER_COL32
void cublasINT8MMWrapper::Gemm(int8_t* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float alpha,
const int8_t* ATransform,
const int8_t* kernel)
{
mu_->lock();
cublasOperation_t opTranspose = CUBLAS_OP_T;
// int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
// cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
cudaDataType_t scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
cublasLtMatrixLayout_t CtransformDesc = NULL;
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4_) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scaleType, sizeof(scaleType));
// cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
// sizeof(cublasLtPointerMode_t));
cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform);
cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (batchCount > 1) {
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
}
// get algo
cublasLtMatmulAlgo_t algo;
int findAlgo = 0;
if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
findAlgo = 1;
cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
cublasLtMatmulAlgoInit(cublaslt_handle_,
computeType,
CUDA_R_32F,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_8I,
tmp_info.algoId,
&algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
}
else {
findAlgo = 1;
int algoId;
if (use_ORDER_COL32_2R_4R4_) {
algoId = 7;
}
else {
algoId = 6;
}
int swizzle = 0;
int customOption = 0;
int tile = 20;
int splitK_val = 0;
int reductionScheme = 0;
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
}
float beta = 0.0f;
cublasLtMatmul(cublaslt_handle_,
matmulDesc,
&alpha,
ATransform,
AtransformDesc,
kernel,
BtransformDesc,
&beta,
res,
CtransformDesc,
res,
CtransformDesc,
(findAlgo == 1 ? (&algo) : NULL),
NULL,
0,
stream_);
cublasLtMatmulDescDestroy(matmulDesc);
cublasLtMatrixLayoutDestroy(AtransformDesc);
cublasLtMatrixLayoutDestroy(BtransformDesc);
cublasLtMatrixLayoutDestroy(CtransformDesc);
sync_check_cuda_error();
mu_->unlock();
}
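/*
 * Note: unlike the int32-output overload above (which uses a fixed alpha of 1), this
 * int8-output path takes a float alpha. In typical int8 inference pipelines (an
 * assumption, not shown in this file) that scalar folds the activation/weight
 * quantization scales and the output requantization scale into a single multiplier, e.g.
 *   alpha = scale_A * scale_B / scale_C;
 * so the int32 accumulator is rescaled directly into the int8 result.
 */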
template<typename T>
int cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<T>* attention_weights)
{
int fusedINT8QKV_type = 0;
const int8_t* Q_weight = (const int8_t*)(attention_weights->query_weight.kernel);
const int8_t* K_weight = (const int8_t*)(attention_weights->key_weight.kernel);
const int8_t* V_weight = (const int8_t*)(attention_weights->value_weight.kernel);
// Q/K/V weights are stored as DataType_ and are contiguous in memory
if ((attention_weights->query_weight.kernel + n * k == attention_weights->key_weight.kernel)
&& (attention_weights->key_weight.kernel + n * k == attention_weights->value_weight.kernel)) {
fusedINT8QKV_type = 1;
}
// Q/K/V weights are stored as int8 and are contiguous in memory
else if ((Q_weight + n * k == K_weight) && (K_weight + n * k == V_weight)) {
fusedINT8QKV_type = 2;
}
return fusedINT8QKV_type;
}
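/*
 * Summary of the return value above:
 *   0 - the Q/K/V weights are not contiguous; the QKV GEMM cannot be fused.
 *   1 - the three weights are laid out back-to-back when viewed as T (DataType_),
 *       i.e. query + n*k == key and key + n*k == value.
 *   2 - the three weights are back-to-back when viewed as int8.
 * Either non-zero value lets a caller issue one fused [3n, k] GEMM instead of three
 * separate n x k GEMMs (an assumption about how callers use this helper).
 */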
bool cublasINT8MMWrapper::getUseOrderCol322R4R4()
{
return use_ORDER_COL32_2R_4R4_;
}
template int
cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<float>* attention_weights);
template int
cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<half>* attention_weights);
#ifdef SPARSITY_ENABLED
// A is sparse weight [m,k], non transposed row major
// B is activation input [k, n], non transposed col major
void cublasINT8MMWrapper::SpGemm(
const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C)
{
cudaDataType_t Atype = CUDA_R_8I;
cudaDataType_t Btype = CUDA_R_8I;
cudaDataType_t Ctype = CUDA_R_8I;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I;
cusparseOrder_t col_order = CUSPARSE_ORDER_COL;
cusparseOrder_t row_order = CUSPARSE_ORDER_ROW;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
auto num_A_rows = m;
auto num_A_cols = k;
auto num_B_rows = k;
auto num_B_cols = n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = num_A_cols;
auto ldb = num_B_rows;
auto ldc = num_C_rows;
float _beta(0.0f);
char mark[256];
sprintf(mark, "%d_%d_%d_%d", 1, m, n, k);
if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t matA, matB, matC;
sp_mat_A_desc_map_[mark] = matA;
sp_mat_B_desc_map_[mark] = matB;
sp_mat_C_desc_map_[mark] = matC;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&sp_mat_A_desc_map_[mark],
num_A_rows,
num_A_cols,
lda,
alignment,
Atype,
row_order,
CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype, col_order))
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
mu_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size))
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(
cusparseLtMatmul(&cusparselt_handle_, &plan, &alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
sync_check_cuda_error();
mu_->unlock();
}
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_utils.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasINT8MMWrapper: public cublasMMWrapper {
private:
bool use_ORDER_COL32_2R_4R4_;
public:
cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
cublasINT8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
#ifdef SPARSITY_ENABLED
cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
#endif
~cublasINT8MMWrapper();
cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper);
void Gemm(int* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const int8_t* ATransform,
const int8_t* kernel);
void Gemm(int8_t* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float alpha,
const int8_t* ATransform,
const int8_t* kernel);
template<typename T>
int getFusedINT8QKVType(const int k, const int n, const AttentionWeight<T>* attention_weights);
bool getUseOrderCol322R4R4();
#ifdef SPARSITY_ENABLED
void SpGemm(const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C);
#endif
};
} // namespace fastertransformer
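/*
 * Illustrative construction sketch, assuming the cublasLt handle, stream, algo map and
 * mutex are owned by the surrounding runtime (as in the shared model instance):
 *
 *   cublasLtHandle_t lt_handle;
 *   cublasLtCreate(&lt_handle);
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   fastertransformer::cublasAlgoMap* algo_map = ...;  // created elsewhere from a gemm config file
 *   std::mutex mu;
 *   fastertransformer::cublasINT8MMWrapper int8_wrapper(
 *       lt_handle, stream, algo_map, &mu, true);       // true => CUBLASLT_ORDER_COL32_2R_4R4 for B
 */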
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasMMWrapper.h"
#include "cuda_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublas_handle_(cublas_handle),
cublaslt_handle_(cublaslt_handle),
stream_(stream),
cublas_algo_map_(cublas_algo_map),
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
#ifdef SPARSITY_ENABLED
cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublas_handle_(cublas_handle),
cublaslt_handle_(cublaslt_handle),
cusparselt_handle_(cusparselt_handle),
stream_(stream),
cublas_algo_map_(cublas_algo_map),
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
#endif
cublasMMWrapper::~cublasMMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_));
allocator_ = nullptr;
}
}
cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
cublas_handle_(wrapper.cublas_handle_),
cublaslt_handle_(wrapper.cublaslt_handle_),
#ifdef SPARSITY_ENABLED
cusparselt_handle_(wrapper.cusparselt_handle_),
#endif
stream_(wrapper.stream_),
cublas_algo_map_(wrapper.cublas_algo_map_),
mu_(wrapper.mu_),
allocator_(wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* alpha,
const void* A,
cudaDataType_t Atype,
int lda,
const void* B,
cudaDataType_t Btype,
int ldb,
const void* beta,
void* C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType,
cublasGemmAlgo_t algo)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
sync_check_cuda_error();
mu_->unlock();
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc,
float f_alpha,
float f_beta)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
half h_alpha = (half)(f_alpha);
half h_beta = (half)(f_beta);
mu_->lock();
// TODO: default cublas libs
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
bool using_cublasLt = (Atype_ == CUDA_R_16F) ? true : false;
int batch_count = 1;
// fp32 uses cublas by default; fp16 uses cublasLt by default
const void* alpha = is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_));
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
if (findAlgo) {
if (info.stages != -1) {
using_cublasLt = true;
}
else {
using_cublasLt = false;
}
}
if (using_cublasLt) {
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaDataType_t scaleType;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (is_fp16_computeType) {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
scaleType = CUDA_R_16F;
}
else {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
scaleType = CUDA_R_32F;
}
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&(info.reductionScheme),
sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
&(info.cluster_shapeId),
sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
cublasLtMatmul(cublaslt_handle_,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
C,
Cdesc,
(findAlgo == 1 ? (&algo) : NULL),
workSpace,
workspaceSize,
stream_);
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
}
else {
int cublasAlgo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
B,
Btype_,
ldb,
beta,
C,
Ctype_,
ldc,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo)));
sync_check_cuda_error();
}
mu_->unlock();
}
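/*
 * Note on the dispatch above: FP16 inputs default to the cublasLt path and everything
 * else to cublasGemmEx, but a tuned entry in the algo map overrides this; info.stages != -1
 * selects cublasLt with the recorded algo configuration, while stages == -1 falls back to
 * cublasGemmEx with the recorded algoId. Entries are assumed to come from an offline GEMM
 * profiling sweep; the sweep itself is not part of this file.
 */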
void cublasMMWrapper::setFP32GemmConfig()
{
Atype_ = CUDA_R_32F;
Btype_ = CUDA_R_32F;
Ctype_ = CUDA_R_32F;
computeType_ = CUDA_R_32F;
}
void cublasMMWrapper::setFP16GemmConfig()
{
Atype_ = CUDA_R_16F;
Btype_ = CUDA_R_16F;
Ctype_ = CUDA_R_16F;
computeType_ = CUDA_R_32F;
}
#ifdef ENABLE_BF16
void cublasMMWrapper::setBF16GemmConfig()
{
Atype_ = CUDA_R_16BF;
Btype_ = CUDA_R_16BF;
Ctype_ = CUDA_R_16BF;
computeType_ = CUDA_R_32F;
}
#endif
void cublasMMWrapper::setGemmConfig(cudaDataType_t aType,
cudaDataType_t bType,
cudaDataType_t cType,
cudaDataType_t computeType)
{
Atype_ = aType;
Btype_ = bType;
Ctype_ = cType;
computeType_ = computeType;
}
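/*
 * Illustrative usage sketch: the data-type setters above configure all subsequent Gemm()
 * calls on this wrapper. Buffers, leading dimensions and the wrapper itself are assumed
 * to exist elsewhere.
 *
 *   wrapper.setFP16GemmConfig();            // A/B/C in FP16, FP32 compute type
 *   // C[m, n] = A * B in cuBLAS's column-major convention
 *   wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N,
 *                m, n, k,
 *                d_A, lda,
 *                d_B, ldb,
 *                d_C, ldc);                 // alpha = 1.0f, beta = 0.0f
 */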
CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
{
if (data_type == CUDA_R_16F) {
return HALF_DATATYPE;
}
else if (data_type == CUDA_R_32F) {
return FLOAT_DATATYPE;
}
#ifdef ENABLE_BF16
else if (data_type == CUDA_R_16BF) {
return BFLOAT16_DATATYPE;
}
#endif
return FLOAT_DATATYPE;
}
#if (CUDART_VERSION >= 11000)
// input, weight, output are row-major
// only works for cublas 11.x
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t Atype, Btype, Ctype;
cublasComputeType_t computeType;
cudaDataType_t scaleType;
float alpha_float = 1.0f;
float beta_float = 0.0f;
half alpha_half = half(1.0f);
half beta_half = half(0.0f);
void * alpha, *beta;
// int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
if (Atype_ == CUDA_R_32F) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_32F;
Btype = CUDA_R_32F;
Ctype = CUDA_R_32F;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else if (Atype_ == CUDA_R_16BF) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_16BF;
Btype = CUDA_R_16BF;
Ctype = CUDA_R_16BF;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else {
computeType = CUBLAS_COMPUTE_16F;
Atype = CUDA_R_16F;
Btype = CUDA_R_16F;
Ctype = CUDA_R_16F;
scaleType = CUDA_R_16F;
alpha = &alpha_half;
beta = &beta_half;
}
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
check_cuda_error(cublasLtMatmul(
cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
cublasLtMatmulDescDestroy(operationDesc);
}
#endif
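/*
 * Note on the bias variant above: it runs the same cublasLt matmul but attaches
 * CUBLASLT_EPILOGUE_BIAS, so the bias vector is added inside the GEMM kernel instead of
 * in a separate elementwise launch. Per the cublasLt contract the bias has one element
 * per row of the output matrix as cublasLt sees it (length m for the m x n layout created
 * above). FP32 and BF16 inputs take the TF32 fast path (CUBLAS_COMPUTE_32F_FAST_TF32);
 * FP16 accumulates in FP16.
 */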
void cublasMMWrapper::setStream(cudaStream_t stream)
{
stream_ = stream;
}
void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const int64_t strideA,
const void* B,
const int ldb,
const int64_t strideB,
void* C,
const int ldc,
const int64_t strideC,
const int batch_count,
const float f_alpha,
const float f_beta)
{
half h_alpha = (half)f_alpha;
half h_beta = (half)f_beta;
mu_->lock();
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
const void* alpha =
is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<const void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<const void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
strideA,
B,
Btype_,
ldb,
strideB,
beta,
C,
Ctype_,
ldc,
strideC,
batch_count,
computeType_,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
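/*
 * Worked example for the strided batched call above: to multiply batch_count independent
 * m x k by k x n problems stored back-to-back (in cuBLAS's column-major view), the strides
 * are simply the per-matrix element counts. Buffers are assumed to be allocated elsewhere.
 *
 *   int64_t strideA = (int64_t)m * k;
 *   int64_t strideB = (int64_t)k * n;
 *   int64_t strideC = (int64_t)m * n;
 *   wrapper.stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
 *                              d_A, lda, strideA,
 *                              d_B, ldb, strideB,
 *                              d_C, ldc, strideC,
 *                              batch_count, 1.0f, 0.0f);
 */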
void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const float f_alpha,
const void* A,
cudaDataType_t AType,
const int lda,
const int64_t strideA,
const void* B,
cudaDataType_t BType,
const int ldb,
const int64_t strideB,
const float f_beta,
void* C,
cudaDataType_t CType,
const int ldc,
const int64_t strideC,
const int batch_count,
cudaDataType_t computeType)
{
half h_alpha = (half)f_alpha;
half h_beta = (half)f_beta;
mu_->lock();
int is_fp16_computeType = computeType == CUDA_R_16F ? 1 : 0;
const void* alpha =
is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<const void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<const void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
AType,
lda,
strideA,
B,
BType,
ldb,
strideB,
beta,
C,
CType,
ldc,
strideC,
batch_count,
computeType,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
void cublasMMWrapper::batchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* const* A,
const int lda,
const void* const* B,
const int ldb,
void* const* C,
const int ldc,
const int batch_count)
{
float f_alpha = static_cast<float>(1.0f);
float f_beta = static_cast<float>(0.0f);
half h_alpha = (half)1.0f;
half h_beta = (half)0.0f;
mu_->lock();
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
const void* alpha = is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
B,
Btype_,
ldb,
beta,
C,
Ctype_,
ldc,
batch_count,
computeType_,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const int k, const int n)
{
CublasDataType data_type = getCublasDataType(Atype_);
if (cublas_algo_map_->isExist(batch_count, m, k, n, data_type) == false
|| cublas_algo_map_->isExist(1, m, k, n, data_type) == false) {
return false;
}
else {
return cublas_algo_map_->getAlgo(batch_count, m, k, n, data_type).exec_time
< 3 * cublas_algo_map_->getAlgo(1, m, k, n, data_type).exec_time;
}
}
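/*
 * Note: the heuristic above fuses the batch only when the algo map's recorded timing for
 * the batched shape is cheaper than roughly three single GEMMs of the same shape (the
 * factor 3 is hard-coded above). Illustrative numbers: if the batched shape ran in 2.1 ms
 * and the single-GEMM shape in 0.9 ms, then 2.1 < 3 * 0.9 = 2.7 and isFuseBatchGemm()
 * returns true; at 3.0 ms it would return false.
 */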
#ifdef SPARSITY_ENABLED
void cublasMMWrapper::SpGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const void* B,
void* C)
{
if (Atype_ != CUDA_R_16F || Btype_ != CUDA_R_16F || Ctype_ != CUDA_R_16F) {
throw std::runtime_error("\n[FT][ERROR] sparse GEMM only supports FP16 data type now.");
}
static bool not_printed_fp32_accumulation_warning = true;
if (computeType_ != CUDA_R_16F && not_printed_fp32_accumulation_warning) {
printf("[FT][WARNING] cublasMMWrapper sets to FP32 compute type, "
"but sparse gemm will use FP16 compute type since cusparselt "
"supports FP16 accumulation only.\n");
not_printed_fp32_accumulation_warning = false;
}
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = (transa == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
cusparseOperation_t opB = (transb == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F;
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
bool is_rowmajor = (order == CUSPARSE_ORDER_ROW);
bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE);
bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE);
auto num_A_rows = (isA_transposed) ? k : m;
auto num_A_cols = (isA_transposed) ? m : k;
auto num_B_rows = (isB_transposed) ? n : k;
auto num_B_cols = (isB_transposed) ? k : n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = (is_rowmajor) ? num_A_cols : num_A_rows;
auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows;
auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows;
float _alpha(1.0f);
float _beta(0.0f);
char mark[256];
sprintf(mark, "%d_%d_%d_%d", 1, m, n, k);
if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t matA, matB, matC;
sp_mat_A_desc_map_[mark] = matA;
sp_mat_B_desc_map_[mark] = matB;
sp_mat_C_desc_map_[mark] = matC;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&sp_mat_A_desc_map_[mark],
num_A_rows,
num_A_cols,
lda,
alignment,
Atype_,
order,
CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype_, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype_, order))
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
mu_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size))
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(
cusparseLtMatmul(&cusparselt_handle_, &plan, &_alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
sync_check_cuda_error();
mu_->unlock();
}
size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
{
// Get a compressed matrix size of shape (m, k) used in cusparselt.
auto Atype_ = CUDA_R_16F;
cusparseOrder_t order = CUSPARSE_ORDER_COL;
unsigned alignment = 16;
int num_A_rows = m;
int num_A_cols = k;
int lda = num_A_rows;
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&matA,
num_A_rows,
num_A_cols,
lda,
alignment,
Atype_,
order,
CUSPARSELT_SPARSITY_50_PERCENT));
size_t compressed_size = 0;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &matA, &compressed_size));
return compressed_size;
}
void cublasMMWrapper::compressMatrix(const void* input, void* output, const int m, const int k)
{
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseLtMatDescriptor_t matA;
unsigned alignment = 16;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&cusparselt_handle_, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &matA, true, opA, input, output, stream_))
sync_check_cuda_error();
}
bool cublasMMWrapper::isUseSparse(const int batch_count, const int m, const int n, const int k)
{
return cublas_algo_map_->isUseSparse(batch_count, m, n, k);
}
#endif
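/*
 * Illustrative sketch of the FP16 sparse path above, only meaningful when
 * SPARSITY_ENABLED is defined. The weight must first be compressed into cusparseLt's
 * 2:4 structured-sparse format; device allocation is assumed to happen elsewhere.
 *
 *   size_t compressed_bytes = wrapper.getSparseMatrixSize(m, k);
 *   void*  d_compressed     = nullptr;   // allocate compressed_bytes on the device
 *   wrapper.compressMatrix(d_weight_fp16, d_compressed, m, k);
 *   // C[m, n] = compressed_A[m, k] * B[k, n], FP16 in/out
 *   wrapper.SpGemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, d_compressed, d_B, d_C);
 */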
std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
cudaStream_t stream)
{
#if (CUBLAS_VERSION) <= 11601
FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
return {false, cublasLtMatmulAlgo_t{}};
#else
size_t returnSize;
int32_t pointer_mode;
cublasLtMatmulDescGetAttribute(
computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize);
std::vector<cublasLtMatmulHeuristicResult_t> heuristics(200);
cublasLtMatmulPreference_t preference;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceInit(preference));
uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size)));
#if (CUBLAS_VERSION) <= 12000
uint32_t pointer_mode_mask = 0;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
#endif
int return_count = 0;
auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle,
computeDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
preference,
heuristics.size(),
heuristics.data(),
&return_count);
heuristics.resize(return_count);
std::map<int, std::vector<float>> algo_results;
for (const auto& heuristic : heuristics) {
cublasLtMatmulAlgo_t algo = heuristic.algo;
int32_t algo_id;
cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
float my_alpha = 1.0f;
float my_beta = 0.0f;
for (int i = 0; i < 11; i++) {
float duration_ms;
cudaEventRecord(start_event, stream);
check_cuda_error(cublasLtMatmul(lightHandle,
computeDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
cublas_workspace_,
CUBLAS_WORKSPACE_SIZE,
stream));
cudaEventRecord(stop_event, stream);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&duration_ms, start_event, stop_event);
algo_results[algo_id].push_back(duration_ms);
}
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
std::sort(algo_results[algo_id].begin(), algo_results[algo_id].end());
}
cublasLtMatmulHeuristicResult_t result;
float best_time = INFINITY;
for (const auto& heuristic : heuristics) {
cublasLtMatmulAlgo_t algo = heuristic.algo;
int32_t algo_id;
cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
const auto& results = algo_results[algo_id];
if (results.size() > 0 && results[5] < best_time) {
best_time = results[5];
result = heuristic;
}
}
return {best_time != INFINITY, result.algo};
#endif
}
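/*
 * Note: findBestAlgo() times every heuristic candidate 11 times, sorts the per-algo
 * timings and compares the medians (results[5]), so a single noisy launch does not pick
 * the wrong algorithm. The chosen algo is then cached by cublasLtMatmulWrapper() below,
 * keyed on the compute descriptor and the four matrix layouts.
 */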
cublasMMWrapper::MatrixLayout cublasMMWrapper::createMatrixLayout(cublasLtMatrixLayout_t Mdesc)
{
size_t returnSize;
MatrixLayout m_layout;
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &std::get<0>(m_layout), sizeof(std::get<0>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &std::get<1>(m_layout), sizeof(std::get<1>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_ROWS, &std::get<2>(m_layout), sizeof(std::get<2>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_COLS, &std::get<3>(m_layout), sizeof(std::get<3>(m_layout)), &returnSize);
return m_layout;
}
cublasStatus_t cublasMMWrapper::cublasLtMatmulWrapper(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream)
{
cache_idx_t cache_idx{
computeDesc,
{createMatrixLayout(Adesc), createMatrixLayout(Bdesc), createMatrixLayout(Cdesc), createMatrixLayout(Ddesc)}};
cublasLtMatmulAlgo_t algo_value;
bool found_algo = false;
if (algo == nullptr) {
if (algo_cache.find(cache_idx) == algo_cache.end()) {
auto result =
findBestAlgo(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, D, Ddesc, stream);
if (result.first) {
algo_cache[cache_idx] = result.second;
algo_value = result.second;
found_algo = true;
}
}
else {
algo_value = algo_cache[cache_idx];
found_algo = true;
}
}
return cublasLtMatmul(lightHandle,
computeDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
found_algo ? &algo_value : algo,
workspace,
workspaceSizeInBytes,
stream);
}
void cublasMMWrapper::_Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
void* C,
const int ldc,
const void* alpha,
const int mode,
const bool per_column_scaling)
{
/* mode:
* - 0: int8 * int8 -> int32 -> int8
* - 1: int8 * int8 -> int32 -> int32
*/
#if (CUBLAS_VERSION) <= 11601
FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
#else
mu_->lock();
const auto op_a = CUBLAS_OP_T;
const auto op_b = CUBLAS_OP_N;
const auto dataType = CUDA_R_8I;
const auto resultType = mode == 0 ? CUDA_R_8I : CUDA_R_32I;
const auto computeType = CUBLAS_COMPUTE_32I;
const auto scaleType = mode == 0 ? CUDA_R_32F : CUDA_R_32I;
const int batch_count = 1;
const void* beta;
int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(dataType));
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(dataType));
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// --------------------------------------
// Create descriptors for the original matrices
check_cuda_error(cublasLtMatrixLayoutCreate(&Adesc, dataType, k, m, lda));
check_cuda_error(cublasLtMatrixLayoutCreate(&Bdesc, dataType, k, n, ldb));
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, resultType, m, n, ldc));
check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType));
auto pointer_mode = CUBLASLT_POINTER_MODE_HOST;
if (mode == 0) {
pointer_mode =
per_column_scaling ? CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST : CUBLASLT_POINTER_MODE_DEVICE;
}
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_a, sizeof(cublasOperation_t)));
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_b, sizeof(cublasOperation_t)));
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSC, &op_b, sizeof(cublasOperation_t)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode)));
const int32_t int_one = 1;
const int32_t int_zero = 0;
const float float_zero = 0;
if (mode == 0) {
beta = per_column_scaling ? &float_zero : NULL;
}
else {
alpha = &int_one;
beta = &int_zero;
}
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
sync_check_cuda_error();
auto ret = cublasLtMatmulWrapper(cublaslt_handle_,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
C,
Cdesc,
NULL,
workSpace,
workspaceSize,
stream_);
check_cuda_error(ret);
sync_check_cuda_error();
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
mu_->unlock();
#endif
}
void cublasMMWrapper::Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int8_t* C,
const int ldc,
const float* alpha,
const bool per_column_scaling)
{
return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, alpha, 0, per_column_scaling);
}
void cublasMMWrapper::Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int32_t* C,
const int ldc)
{
return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false);
}
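/*
 * Illustrative usage sketch for the two public Int8Gemm overloads above. Device buffers
 * and the scale data are assumed to be prepared elsewhere; the required length of the
 * per-channel scale vector follows cublasLt's ALPHA_DEVICE_VECTOR rule.
 *
 *   // mode 0: int8 x int8 -> int32 accumulate -> int8 output, scaled by a device-side alpha
 *   wrapper.Int8Gemm(m, n, k, d_A_int8, lda, d_B_int8, ldb,
 *                    d_C_int8, ldc, d_alpha_scales, true);   // true => per-column scaling
 *
 *   // mode 1: int8 x int8 -> plain int32 output (alpha = 1, beta = 0)
 *   wrapper.Int8Gemm(m, n, k, d_A_int8, lda, d_B_int8, ldb, d_C_int32, ldc);
 */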
} // namespace fastertransformer