Unverified Commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
@@ -163,8 +163,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
         if (enable_slice) {
             if (slice_dim == 1) {
                 size_t start = 0;
-                ConcateSlice slice0{.slices = {{0, 1}}};
-                ConcateSlice slice1{.slices = {{}}};
+                ConcateSlice slice0{{{0, 1}}};
+                ConcateSlice slice1{{{}}};
                 for (auto len : slice_shape) {
                     size_t stride = len / tensor_para_size;
                     slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
@@ -181,8 +181,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
         if (enable_slice) {
             if (slice_dim == 1) {
                 size_t start = 0;
-                ConcateSlice slice0{.slices = {{0, dim0}}};
-                ConcateSlice slice1{.slices = {{}}};
+                ConcateSlice slice0{{{0, dim0}}};
+                ConcateSlice slice1{{{}}};
                 for (auto len : slice_shape) {
                     size_t stride = len / tensor_para_size;
                     slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
@@ -192,8 +192,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
             }
             else {
                 size_t start = 0;
-                ConcateSlice slice0{.slices = {}};
-                ConcateSlice slice1{.slices = {{0, dim1}}};
+                ConcateSlice slice0{{}};
+                ConcateSlice slice1{{{0, dim1}}};
                 for (auto len : slice_shape) {
                     size_t stride = len / tensor_para_size;
                     slice0.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
......
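All three hunks above make the same substitution: the old ConcateSlice initializers use designated initializers, a C++20 feature that GCC and Clang accept as an extension in C++14 mode but MSVC rejects, so the port falls back to positional aggregate initialization. A minimal sketch of the equivalence, assuming ConcateSlice is an aggregate whose only member is the slices vector (the real definition lives elsewhere in the tree):

    // Sketch only: an assumed stand-in for the real ConcateSlice.
    #include <cstddef>
    #include <utility>
    #include <vector>

    struct ConcateSlice {
        std::vector<std::pair<size_t, size_t>> slices;
    };

    int main()
    {
        // C++20 designated form, rejected by MSVC in C++14 mode:
        //     ConcateSlice slice0{.slices = {{0, 1}}};
        // Portable positional form: the outer braces initialize the
        // aggregate, the inner braces its first (and only) member.
        ConcateSlice slice0{{{0, 1}}};
        ConcateSlice slice1{{}};  // empty slice list
        slice1.slices.push_back({0, 128});
        return 0;
    }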
@@ -19,6 +19,7 @@
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
 #include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
 #include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/models/llama/LlamaNcclGuard.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/models/llama/llama_utils.h"
......
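src/turbomind/macro.h is a new header that this commit includes throughout the tree, but its contents are not shown in the diff. Judging from the commit bullets (__PRETTY_FUNCTION__, HALF_FLT_MAX), it plausibly centralizes MSVC shims along these lines; this is a hypothetical sketch, not the real file:

    // Hypothetical reconstruction; the actual macro.h is not part of this diff.
    #pragma once

    #ifdef _MSC_VER
    // MSVC spells the decorated function signature __FUNCSIG__.
    #define __PRETTY_FUNCTION__ __FUNCSIG__
    #endif

    // Largest finite IEEE binary16 value; useful where FLT_MAX overflows half.
    #ifndef HALF_FLT_MAX
    #define HALF_FLT_MAX 65504.0f
    #endif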
@@ -23,6 +23,7 @@
 #include "src/turbomind/models/llama/LlamaV2.h"
 #include "src/turbomind/kernels/decoding_kernels.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/models/llama/LlamaBatch.h"
 #include "src/turbomind/models/llama/LlamaNcclGuard.h"
 #include "src/turbomind/models/llama/LlamaWeight.h"
......
 // Copyright (c) OpenMMLab. All rights reserved.
+#include "src/turbomind/macro.h"
 #include "src/turbomind/models/llama/llama_decoder_kernels.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include <cooperative_groups.h>
......
@@ -2,6 +2,7 @@
 #include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
@@ -111,7 +112,7 @@ template<typename T>
 void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream)
 {
     auto total = static_cast<size_t>(m) * n;
-    dim3 block(std::min(total, 1024UL));
+    dim3 block(std::min((unsigned long)total, 1024UL));
     dim3 grid((total + block.x - 1) / block.x);
 
     addResidual<<<grid, block, 0, stream>>>(out, in, total);
......
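The std::min change is an LLP64 fix. On Linux (LP64), size_t and unsigned long are the same 64-bit type, so std::min(total, 1024UL) deduces one template argument; on 64-bit Windows (LLP64), unsigned long is 32 bits, the two arguments have different types, and deduction fails. A small sketch of the failure and of an alternative spelling (variable names are illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        size_t total = 4096;
        // Fails on MSVC x64: size_t (64-bit) and unsigned long (32-bit)
        // cannot both bind to std::min<T>'s single type parameter.
        //     auto bad = std::min(total, 1024UL);
        // The patch unifies the types with a cast:
        unsigned long block = std::min((unsigned long)total, 1024UL);
        // An explicit template argument would avoid the narrowing cast:
        size_t block2 = std::min<size_t>(total, 1024);
        std::printf("%lu %zu\n", block, block2);
        return 0;
    }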
@@ -38,7 +38,7 @@ DLDevice getDLDevice(triton::Tensor& tensor)
         device_id = ptr_attr.device;
     }
 
-    DLDevice device{.device_id = device_id};
+    DLDevice device{kDLCPU, device_id};
 
     switch (tensor.where) {
         case triton::MEMORY_CPU:
@@ -60,7 +60,7 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t
 {
     DLDevice device = getDLDevice(tensor);
 
-    DLDataType data_type{.lanes = 1};
+    DLDataType data_type{0, 0, 1};
     switch (tensor.type) {
         case triton::TYPE_BOOL:
             data_type.code = DLDataTypeCode::kDLBool;
@@ -118,16 +118,15 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t
         default:
             break;
     }
-    DLTensor dl_tensor{.data = const_cast<void*>(tensor.data),
-                       .device = device,
-                       .ndim = (int32_t)(tensor.shape.size()),
-                       .dtype = data_type,
-                       .shape = reinterpret_cast<int64_t*>(const_cast<size_t*>(tensor.shape.data())),
-                       .strides = (int64_t*)(nullptr),
-                       .byte_offset = 0};
-    return std::unique_ptr<DLManagedTensor>(
-        new DLManagedTensor{.dl_tensor = dl_tensor, .manager_ctx = nullptr, .deleter = [](DLManagedTensor*) {}});
+    DLTensor dl_tensor{const_cast<void*>(tensor.data),
+                       device,
+                       (int32_t)(tensor.shape.size()),
+                       data_type,
+                       reinterpret_cast<int64_t*>(const_cast<size_t*>(tensor.shape.data())),
+                       (int64_t*)(nullptr),
+                       0};
+    return std::unique_ptr<DLManagedTensor>(new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor*) {}});
 }
 
 triton::MemoryType getMemoryType(DLDevice device)
......
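The DLPack structs get the same designated-initializer treatment: positional initialization must list members in declaration order, so defaults that the designated form left implicit now have to be spelled out. A sketch against the stock dlpack header, assumed to be on the include path:

    #include <cstdint>
    #include <dlpack/dlpack.h>

    int main()
    {
        int32_t device_id = 0;
        // DLDevice is declared {device_type, device_id}. The old
        // {.device_id = device_id} zero-initialized device_type; the new
        // form names kDLCPU explicitly, and the switch on tensor.where
        // that follows overwrites it for GPU tensors anyway.
        DLDevice device{kDLCPU, device_id};
        // DLDataType is declared {code, bits, lanes}, so {.lanes = 1}
        // becomes {0, 0, 1}; code and bits are filled in afterwards.
        DLDataType data_type{0, 0, 1};
        (void)device;
        (void)data_type;
        return 0;
    }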
@@ -28,6 +28,18 @@
 cmake_minimum_required (VERSION 3.18)
 
 project(tritonturbomindbackend LANGUAGES C CXX)
 
+add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp)
+target_link_libraries(TransformerTritonBackend PUBLIC nccl_utils)
+set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
+install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+add_subdirectory(llama)
+
+# Needn't build triton backend on windows
+if (MSVC)
+    return ()
+endif()
+
 #
 # Options
 #
@@ -266,27 +278,3 @@ export(
 )
 
 export(PACKAGE TritonTurboMindBackend)
-
-# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
-target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils)
-set_target_properties(TransformerTritonBackend PROPERTIES
-    INSTALL_RPATH "../../nvidia/nccl/lib/"
-)
-install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
-add_subdirectory(llama)
@@ -36,7 +36,7 @@
 #include <vector>
 
 #pragma GCC diagnostic push
-//#pragma GCC diagnostic ignored "-Wsign-compare"
+// #pragma GCC diagnostic ignored "-Wsign-compare"
 #pragma GCC diagnostic ignored "-Wcast-function-type"
 #pragma warning(push, 0)
 #pragma warning(pop)
@@ -52,6 +52,7 @@
 #include "triton/core/tritonbackend.h"
 
 // FT's libraries have dependency with triton's lib
+#include "src/turbomind/macro.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
 #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
......
@@ -22,7 +22,8 @@ set(llama_triton_backend_files
     LlamaTritonModelInstance.cc
 )
 
+find_package(CUDAToolkit REQUIRED)
 add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files})
 set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt)
+target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt)
 target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14)
@@ -19,6 +19,7 @@
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
 
 #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
 #include "src/turbomind/triton_backend/triton_utils.hpp"
 #include "src/turbomind/utils/Tensor.h"
......
@@ -23,7 +23,9 @@
 #include <functional>
 #include <memory>
 #include <sstream>
+#ifdef __linux__
 #include <sys/time.h>
+#endif
 #include <vector>
 
 #include "src/turbomind/utils/Tensor.h"
......
@@ -14,29 +14,31 @@
 cmake_minimum_required(VERSION 3.8)
 
+find_package(CUDAToolkit REQUIRED)
+
 add_subdirectory(gemm_test)
 
 add_library(cuda_utils STATIC cuda_utils.cc)
 set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(cuda_utils PUBLIC -lcudart)
+target_link_libraries(cuda_utils PUBLIC CUDA::cudart)
 
 add_library(logger STATIC logger.cc)
 set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(logger PUBLIC -lcudart)
+target_link_libraries(logger PUBLIC CUDA::cudart)
 
 add_library(cublasAlgoMap STATIC cublasAlgoMap.cc)
 set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(cublasAlgoMap PUBLIC -lcublas -lcudart -lcurand cuda_utils logger)
+target_link_libraries(cublasAlgoMap PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cuda_utils logger)
 
 add_library(cublasMMWrapper STATIC cublasMMWrapper.cc)
 set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(cublasMMWrapper PUBLIC -lcublas -lcudart -lcurand cublasAlgoMap cuda_utils logger)
+target_link_libraries(cublasMMWrapper PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cublasAlgoMap cuda_utils logger)
 
 if (SPARSITY_SUPPORT)
-    target_link_libraries(cublasMMWrapper PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(cublasMMWrapper PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 
 add_library(word_list STATIC word_list.cc)
@@ -46,7 +48,11 @@ set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(nvtx_utils STATIC nvtx_utils.cc)
 set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(nvtx_utils PUBLIC -lnvToolsExt)
+if(${CMAKE_VERSION} VERSION_LESS "3.25")
+    target_link_libraries(nvtx_utils PUBLIC CUDA::nvToolsExt -ldl)
+else()
+    target_link_libraries(nvtx_utils PUBLIC CUDA::nvtx3 -ldl)
+endif()
 
 add_library(memory_utils STATIC memory_utils.cu)
 set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
@@ -70,13 +76,13 @@ endif()
 
 add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
 set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(cublasINT8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand cublasAlgoMap cublasMMWrapper cuda_utils logger)
+target_link_libraries(cublasINT8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
 
 if(ENABLE_FP8)
     add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
     set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
     set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-    target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
+    target_link_libraries(cublasFP8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand
                           cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
 endif()
@@ -89,10 +95,10 @@ add_library(gemm STATIC gemm.cc)
 set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(gemm PUBLIC
-                      -lcublas -lcublasLt -lcudart -lcurand
+                      CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand
                       cublasAlgoMap memory_utils cuda_utils logger)
 
 if (SPARSITY_SUPPORT)
-    target_link_libraries(gemm PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(gemm PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 
 add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
......
@@ -22,7 +22,7 @@
 #include "stdlib.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
-#include <dirent.h>
+#include <filesystem>
 #include <numeric>
 #include <stdlib.h>
 #include <string>
@@ -31,6 +31,7 @@
 #include <unordered_map>
 #include <vector>
 
+namespace fs = std::filesystem;
 
 namespace turbomind {
@@ -44,7 +45,7 @@ Tensor::Tensor():
 }
 
 Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
-    where(_where), type(_type), shape(_shape), data(_data)
+    where(_where), type(_type), shape(_shape), data(const_cast<void*>(_data))
 {
 }
@@ -53,7 +54,7 @@ Tensor::Tensor(const MemoryType _where,
                const DataType _type,
                const std::vector<size_t> _shape,
                const void* _data,
                const std::vector<size_t> _offset):
-    where(_where), type(_type), shape(_shape), data(_data), offsets(_offset)
+    where(_where), type(_type), shape(_shape), data(const_cast<void*>(_data)), offsets(_offset)
 {
 }
@@ -407,14 +408,10 @@ std::string TensorMap::toString()
 TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
 {
-    DIR* dir_p = opendir(base_folder.c_str());
-    FT_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. ", base_folder.c_str()));
-    struct dirent* dp;
     TensorMap ret_tensor;
-    while ((dp = readdir(dir_p)) != nullptr) {
-        std::string filename(dp->d_name);
-        size_t len = filename.length();
+    for (auto const& entry : fs::directory_iterator{base_folder}) {
+        std::string filename = entry.path().stem().string();
+        size_t len = filename.length();
         if (len < 4 || filename.compare(len - 4, 4, ".npy")) {
             continue;
         }
@@ -439,18 +436,13 @@ TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
         ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)});
     }
 
-    closedir(dir_p);
-
     return ret_tensor;
 }
 
 void TensorMap::saveNpy(const std::string& base_folder)
 {
-    mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
-    int ret = mkdir(base_folder.c_str(), mode_0755);
-    FT_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
+    bool ret = fs::exists(base_folder) | fs::create_directory(base_folder);
+    FT_CHECK_WITH_INFO(ret == true, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
 
     for (const auto& item : tensor_map_) {
         item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy");
     }
......
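dirent.h and POSIX mkdir have no MSVC counterparts, so directory traversal and creation move to C++17 std::filesystem, aliased as fs above. A self-contained sketch of the pattern (the folder name is illustrative); note that path().stem() strips the extension, so suffix checks belong on path().filename() or path().extension():

    #include <filesystem>
    #include <iostream>
    #include <string>

    namespace fs = std::filesystem;

    int main()
    {
        const std::string base_folder = "weights";  // hypothetical folder

        // Replaces mkdir(path, 0755) plus the errno == EEXIST handling.
        const bool ok = fs::exists(base_folder) || fs::create_directory(base_folder);
        if (!ok) {
            std::cerr << "could not create folder " << base_folder << '\n';
            return 1;
        }

        // Replaces the opendir()/readdir() loop.
        for (const auto& entry : fs::directory_iterator{base_folder}) {
            if (entry.path().extension() == ".npy") {
                std::cout << entry.path().filename().string() << '\n';
            }
        }
        return 0;
    }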
@@ -16,6 +16,7 @@
 #pragma once
 
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include "src/turbomind/utils/cuda_fp8_utils.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -24,7 +25,6 @@
 #include "stdlib.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
-#include <dirent.h>
 #include <numeric>
 #include <stdlib.h>
 #include <string>
@@ -107,11 +107,11 @@ typedef enum memorytype_enum
 } MemoryType;
 
 struct Tensor {
-    const MemoryType where;
-    const DataType type;
-    const std::vector<size_t> shape;
-    const void* data;  // TODO(bhseuh) modify from const void* to void* const
-    const std::vector<size_t> offsets = std::vector<size_t>{};
+    MemoryType where;
+    DataType type;
+    std::vector<size_t> shape;
+    void* data;
+    std::vector<size_t> offsets = std::vector<size_t>{};
 
     Tensor();
     Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data);
......
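Dropping const from the members (together with the const_cast in the constructors earlier in this file) makes Tensor assignable again: a class with const members has its copy and move assignment operators implicitly deleted, which breaks any container or map operation that assigns elements. A minimal illustration, not the real Tensor:

    struct ConstMember {
        const int where;
    };

    struct PlainMember {
        int where;
    };

    int main()
    {
        PlainMember a{1}, b{2};
        a = b;  // OK: the implicit operator= is generated

        ConstMember c{1}, d{2};
        // c = d;  // error on every compiler: copy assignment is implicitly
        //         // deleted because of the const member; std::vector and
        //         // std::unordered_map element assignment hit the same wall.
        (void)a; (void)c; (void)d;
        return 0;
    }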
@@ -20,6 +20,7 @@
 #pragma once
 
 #include "cuda_utils.h"
+#include "src/turbomind/macro.h"
 #include <cuda_runtime.h>
 #include <unordered_map>
 #include <vector>
......
@@ -16,6 +16,7 @@
 #include "cublasFP8MMWrapper.h"
 #include "cuda_utils.h"
+#include "src/turbomind/macro.h"
 
 namespace turbomind {
......
@@ -16,6 +16,7 @@
 #include "cublasMMWrapper.h"
 #include "cuda_utils.h"
+#include "src/turbomind/macro.h"
 
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -803,13 +804,13 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
     FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
     return {false, cublasLtMatmulAlgo_t{}};
 #else
-    size_t  returnSize;
+    size_t returnSize;
     int32_t pointer_mode;
     cublasLtMatmulDescGetAttribute(
         computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize);
 
     std::vector<cublasLtMatmulHeuristicResult_t> heuristics(200);
-    cublasLtMatmulPreference_t                   preference;
+    cublasLtMatmulPreference_t preference;
     check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
     check_cuda_error(cublasLtMatmulPreferenceInit(preference));
     uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE;
@@ -821,8 +822,8 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
         preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
 #endif
 
-    int  return_count = 0;
-    auto ret          = cublasLtMatmulAlgoGetHeuristic(lightHandle,
+    int return_count = 0;
+    auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle,
                                               computeDesc,
                                               Adesc,
                                               Bdesc,
@@ -837,7 +838,7 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
     std::map<int, std::vector<float>> algo_results;
     for (const auto& heuristic : heuristics) {
         cublasLtMatmulAlgo_t algo = heuristic.algo;
-        int32_t              algo_id;
+        int32_t algo_id;
         cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
 
         cudaEvent_t start_event, stop_event;
@@ -845,7 +846,7 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
         cudaEventCreate(&stop_event);
 
         float my_alpha = 1.0f;
-        float my_beta  = 0.0f;
+        float my_beta = 0.0f;
 
         for (int i = 0; i < 11; i++) {
             float duration_ms;
@@ -876,16 +877,16 @@ std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHand
     }
 
     cublasLtMatmulHeuristicResult_t result;
-    float                           best_time = INFINITY;
+    float best_time = INFINITY;
     for (const auto& heuristic : heuristics) {
         cublasLtMatmulAlgo_t algo = heuristic.algo;
-        int32_t              algo_id;
+        int32_t algo_id;
         cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
         const auto& results = algo_results[algo_id];
 
         if (results.size() > 0 && results[5] < best_time) {
             best_time = results[5];
-            result    = heuristic;
+            result = heuristic;
         }
     }
@@ -989,20 +990,20 @@ void cublasMMWrapper::_Int8Gemm(const int m,
 #else
     mu_->lock();
-    const auto  op_a        = CUBLAS_OP_T;
-    const auto  op_b        = CUBLAS_OP_N;
-    const auto  dataType    = CUDA_R_8I;
-    const auto  resultType  = mode == 0 ? CUDA_R_8I : CUDA_R_32I;
-    const auto  computeType = CUBLAS_COMPUTE_32I;
-    const auto  scaleType   = mode == 0 ? CUDA_R_32F : CUDA_R_32I;
-    const int   batch_count = 1;
+    const auto op_a = CUBLAS_OP_T;
+    const auto op_b = CUBLAS_OP_N;
+    const auto dataType = CUDA_R_8I;
+    const auto resultType = mode == 0 ? CUDA_R_8I : CUDA_R_32I;
+    const auto computeType = CUBLAS_COMPUTE_32I;
+    const auto scaleType = mode == 0 ? CUDA_R_32F : CUDA_R_32I;
+    const int batch_count = 1;
     const void* beta;
 
     int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(dataType));
 
     cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(dataType));
 
-    cublasLtMatmulDesc_t   operationDesc = NULL;
+    cublasLtMatmulDesc_t operationDesc = NULL;
     cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
 
     // --------------------------------------
@@ -1027,20 +1028,20 @@ void cublasMMWrapper::_Int8Gemm(const int m,
     check_cuda_error(cublasLtMatmulDescSetAttribute(
         operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode)));
 
-    const int32_t int_one    = 1;
-    const int32_t int_zero   = 0;
-    const float   float_zero = 0;
+    const int32_t int_one = 1;
+    const int32_t int_zero = 0;
+    const float float_zero = 0;
     if (mode == 0) {
         beta = per_column_scaling ? &float_zero : NULL;
     }
     else {
         alpha = &int_one;
-        beta  = &int_zero;
+        beta = &int_zero;
     }
 
     cublasLtMatmulAlgo_t algo;
-    void* workSpace     = cublas_workspace_;
-    int   workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
+    void* workSpace = cublas_workspace_;
+    int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
 
     sync_check_cuda_error();
     auto ret = cublasLtMatmulWrapper(cublaslt_handle_,
......
@@ -17,6 +17,7 @@
 #include "cuda_utils.h"
 #include "src/turbomind/utils/allocator.h"
 #include "src/turbomind/utils/cublasAlgoMap.h"
+#include <array>
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
......
@@ -15,6 +15,7 @@
 */
 
 #include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_fp8_utils.h"
 
 namespace turbomind {
@@ -250,6 +251,8 @@ void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr)
     }
 }
 
+// multiple definitions for msvc
+#ifndef _MSC_VER
 void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
 {
     typedef size_t T;
@@ -286,6 +289,7 @@ void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
         free(tmp);
     }
 }
+#endif
 
 template<typename T>
 void check_max_val(const T* result, const int size)
......
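The #ifndef _MSC_VER fence works around an LLP64 type-identity quirk: on MSVC, size_t is the same type as unsigned long long, so the size_t* overload presumably collides with another 64-bit overload in the same translation unit, while on LP64 Linux size_t is unsigned long and the overloads stay distinct. Schematically (the overload set below is illustrative, not the real one):

    #include <cstddef>
    #include <cstdio>

    // Illustration of the collision, not the real overload set.
    void printValue(unsigned long long v) { std::printf("%llu\n", v); }

    // On MSVC (LLP64), size_t aliases unsigned long long, making this a
    // redefinition of the overload above. On LP64 Linux, size_t is
    // unsigned long, a distinct type, so both overloads compile.
    #ifndef _MSC_VER
    void printValue(size_t v) { std::printf("%zu\n", v); }
    #endif

    int main()
    {
        printValue(42ULL);
        return 0;
    }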
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "3rdparty/INIReader.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include "src/turbomind/utils/logger.h"
@@ -264,11 +265,6 @@ public:
     ~CudaTimer() {}
 };
 
-static double diffTime(timeval start, timeval end)
-{
-    return (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
-}
-
 /* ***************************** common utils ****************************** */
 inline void print_mem_usage(std::string time = "after allocation")
......
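diffTime depended on timeval from the now-guarded sys/time.h header, so it is removed along with gettimeofday (see the commit bullets). A portable std::chrono sketch of the same millisecond measurement; nothing below comes from the lmdeploy tree:

    #include <chrono>
    #include <cstdio>
    #include <thread>

    int main()
    {
        const auto start = std::chrono::steady_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
        const auto end = std::chrono::steady_clock::now();

        // diffTime() returned (sec * 1000 + usec * 0.001), i.e. milliseconds
        // as a double; duration<double, milli> expresses the same quantity.
        const double elapsed_ms =
            std::chrono::duration<double, std::milli>(end - start).count();
        std::printf("elapsed: %.3f ms\n", elapsed_ms);
        return 0;
    }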