Commit 4c9959f6 (unverified), authored by Chen Xin, committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
...@@ -163,8 +163,8 @@ void loadWeights(LlamaDenseWeight<T>& w, ...@@ -163,8 +163,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
if (enable_slice) { if (enable_slice) {
if (slice_dim == 1) { if (slice_dim == 1) {
size_t start = 0; size_t start = 0;
ConcateSlice slice0{.slices = {{0, 1}}}; ConcateSlice slice0{{{0, 1}}};
ConcateSlice slice1{.slices = {{}}}; ConcateSlice slice1{{{}}};
for (auto len : slice_shape) { for (auto len : slice_shape) {
size_t stride = len / tensor_para_size; size_t stride = len / tensor_para_size;
slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)}); slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
...@@ -181,8 +181,8 @@ void loadWeights(LlamaDenseWeight<T>& w, ...@@ -181,8 +181,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
if (enable_slice) { if (enable_slice) {
if (slice_dim == 1) { if (slice_dim == 1) {
size_t start = 0; size_t start = 0;
ConcateSlice slice0{.slices = {{0, dim0}}}; ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{.slices = {{}}}; ConcateSlice slice1{{{}}};
for (auto len : slice_shape) { for (auto len : slice_shape) {
size_t stride = len / tensor_para_size; size_t stride = len / tensor_para_size;
slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)}); slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
...@@ -192,8 +192,8 @@ void loadWeights(LlamaDenseWeight<T>& w, ...@@ -192,8 +192,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
} }
else { else {
size_t start = 0; size_t start = 0;
ConcateSlice slice0{.slices = {}}; ConcateSlice slice0{{}};
ConcateSlice slice1{.slices = {{0, dim1}}}; ConcateSlice slice1{{{0, dim1}}};
for (auto len : slice_shape) { for (auto len : slice_shape) {
size_t stride = len / tensor_para_size; size_t stride = len / tensor_para_size;
slice0.slices.push_back({start + stride * rank, start + stride * (rank + 1)}); slice0.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h" #include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h" #include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/LlamaWeight.h"
......
// Copyright (c) OpenMMLab. All rights reserved. // Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include <cooperative_groups.h> #include <cooperative_groups.h>
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h" #include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh" #include "src/turbomind/utils/cuda_type_utils.cuh"
...@@ -111,7 +112,7 @@ template<typename T> ...@@ -111,7 +112,7 @@ template<typename T>
void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream) void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream)
{ {
auto total = static_cast<size_t>(m) * n; auto total = static_cast<size_t>(m) * n;
dim3 block(std::min(total, 1024UL)); dim3 block(std::min((unsigned long)total, 1024UL));
dim3 grid((total + block.x - 1) / block.x); dim3 grid((total + block.x - 1) / block.x);
addResidual<<<grid, block, 0, stream>>>(out, in, total); addResidual<<<grid, block, 0, stream>>>(out, in, total);
......
...@@ -38,7 +38,7 @@ DLDevice getDLDevice(triton::Tensor& tensor) ...@@ -38,7 +38,7 @@ DLDevice getDLDevice(triton::Tensor& tensor)
device_id = ptr_attr.device; device_id = ptr_attr.device;
} }
DLDevice device{.device_id = device_id}; DLDevice device{kDLCPU, device_id};
switch (tensor.where) { switch (tensor.where) {
case triton::MEMORY_CPU: case triton::MEMORY_CPU:
...@@ -60,7 +60,7 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t ...@@ -60,7 +60,7 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t
{ {
DLDevice device = getDLDevice(tensor); DLDevice device = getDLDevice(tensor);
DLDataType data_type{.lanes = 1}; DLDataType data_type{0, 0, 1};
switch (tensor.type) { switch (tensor.type) {
case triton::TYPE_BOOL: case triton::TYPE_BOOL:
data_type.code = DLDataTypeCode::kDLBool; data_type.code = DLDataTypeCode::kDLBool;
...@@ -118,16 +118,15 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t ...@@ -118,16 +118,15 @@ std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& t
default: default:
break; break;
} }
DLTensor dl_tensor{.data = const_cast<void*>(tensor.data), DLTensor dl_tensor{const_cast<void*>(tensor.data),
.device = device, device,
.ndim = (int32_t)(tensor.shape.size()), (int32_t)(tensor.shape.size()),
.dtype = data_type, data_type,
.shape = reinterpret_cast<int64_t*>(const_cast<size_t*>(tensor.shape.data())), reinterpret_cast<int64_t*>(const_cast<size_t*>(tensor.shape.data())),
.strides = (int64_t*)(nullptr), (int64_t*)(nullptr),
.byte_offset = 0}; 0};
return std::unique_ptr<DLManagedTensor>( return std::unique_ptr<DLManagedTensor>(new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor*) {}});
new DLManagedTensor{.dl_tensor = dl_tensor, .manager_ctx = nullptr, .deleter = [](DLManagedTensor*) {}});
} }
triton::MemoryType getMemoryType(DLDevice device) triton::MemoryType getMemoryType(DLDevice device)
......
...@@ -28,6 +28,18 @@ cmake_minimum_required (VERSION 3.18) ...@@ -28,6 +28,18 @@ cmake_minimum_required (VERSION 3.18)
project(tritonturbomindbackend LANGUAGES C CXX) project(tritonturbomindbackend LANGUAGES C CXX)
add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PUBLIC nccl_utils)
set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
add_subdirectory(llama)
# Needn't build triton backend on windows
if (MSVC)
return ()
endif()
# #
# Options # Options
# #
...@@ -266,27 +278,3 @@ export( ...@@ -266,27 +278,3 @@ export(
) )
export(PACKAGE TritonTurboMindBackend) export(PACKAGE TritonTurboMindBackend)
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils)
set_target_properties(TransformerTritonBackend PROPERTIES
INSTALL_RPATH "../../nvidia/nccl/lib/"
)
install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
add_subdirectory(llama)
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
#include <vector> #include <vector>
#pragma GCC diagnostic push #pragma GCC diagnostic push
//#pragma GCC diagnostic ignored "-Wsign-compare" // #pragma GCC diagnostic ignored "-Wsign-compare"
#pragma GCC diagnostic ignored "-Wcast-function-type" #pragma GCC diagnostic ignored "-Wcast-function-type"
#pragma warning(push, 0) #pragma warning(push, 0)
#pragma warning(pop) #pragma warning(pop)
...@@ -52,6 +52,7 @@ ...@@ -52,6 +52,7 @@
#include "triton/core/tritonbackend.h" #include "triton/core/tritonbackend.h"
// FT's libraries have dependency with triton's lib // FT's libraries have dependency with triton's lib
#include "src/turbomind/macro.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
......
...@@ -22,7 +22,8 @@ set(llama_triton_backend_files ...@@ -22,7 +22,8 @@ set(llama_triton_backend_files
LlamaTritonModelInstance.cc LlamaTritonModelInstance.cc
) )
find_package(CUDAToolkit REQUIRED)
add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files}) add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files})
set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt) target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt)
target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14) target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14)
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/triton_backend/triton_utils.hpp" #include "src/turbomind/triton_backend/triton_utils.hpp"
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
......
...@@ -23,7 +23,9 @@ ...@@ -23,7 +23,9 @@
#include <functional> #include <functional>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#ifdef __linux__
#include <sys/time.h> #include <sys/time.h>
#endif
#include <vector> #include <vector>
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
......
...@@ -14,29 +14,31 @@ ...@@ -14,29 +14,31 @@
cmake_minimum_required(VERSION 3.8) cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
add_subdirectory(gemm_test) add_subdirectory(gemm_test)
add_library(cuda_utils STATIC cuda_utils.cc) add_library(cuda_utils STATIC cuda_utils.cc)
set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cuda_utils PUBLIC -lcudart) target_link_libraries(cuda_utils PUBLIC CUDA::cudart)
add_library(logger STATIC logger.cc) add_library(logger STATIC logger.cc)
set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(logger PUBLIC -lcudart) target_link_libraries(logger PUBLIC CUDA::cudart)
add_library(cublasAlgoMap STATIC cublasAlgoMap.cc) add_library(cublasAlgoMap STATIC cublasAlgoMap.cc)
set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasAlgoMap PUBLIC -lcublas -lcudart -lcurand cuda_utils logger) target_link_libraries(cublasAlgoMap PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cuda_utils logger)
add_library(cublasMMWrapper STATIC cublasMMWrapper.cc) add_library(cublasMMWrapper STATIC cublasMMWrapper.cc)
set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasMMWrapper PUBLIC -lcublas -lcudart -lcurand cublasAlgoMap cuda_utils logger) target_link_libraries(cublasMMWrapper PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cublasAlgoMap cuda_utils logger)
if (SPARSITY_SUPPORT) if (SPARSITY_SUPPORT)
target_link_libraries(cublasMMWrapper PUBLIC -lcusparse -lcusparseLt) target_link_libraries(cublasMMWrapper PUBLIC CUDA::cusparse -lcusparseLt)
endif() endif()
add_library(word_list STATIC word_list.cc) add_library(word_list STATIC word_list.cc)
...@@ -46,7 +48,11 @@ set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ...@@ -46,7 +48,11 @@ set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(nvtx_utils STATIC nvtx_utils.cc) add_library(nvtx_utils STATIC nvtx_utils.cc)
set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(nvtx_utils PUBLIC -lnvToolsExt) if(${CMAKE_VERSION} VERSION_LESS "3.25")
target_link_libraries(nvtx_utils PUBLIC CUDA::nvToolsExt -ldl)
else()
target_link_libraries(nvtx_utils PUBLIC CUDA::nvtx3 -ldl)
endif()
add_library(memory_utils STATIC memory_utils.cu) add_library(memory_utils STATIC memory_utils.cu)
set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
...@@ -70,13 +76,13 @@ endif() ...@@ -70,13 +76,13 @@ endif()
add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc) add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasINT8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand cublasAlgoMap cublasMMWrapper cuda_utils logger) target_link_libraries(cublasINT8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
if(ENABLE_FP8) if(ENABLE_FP8)
add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu) add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand target_link_libraries(cublasFP8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils) cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
endif() endif()
...@@ -89,10 +95,10 @@ add_library(gemm STATIC gemm.cc) ...@@ -89,10 +95,10 @@ add_library(gemm STATIC gemm.cc)
set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm PUBLIC target_link_libraries(gemm PUBLIC
-lcublas -lcublasLt -lcudart -lcurand CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand
cublasAlgoMap memory_utils cuda_utils logger) cublasAlgoMap memory_utils cuda_utils logger)
if (SPARSITY_SUPPORT) if (SPARSITY_SUPPORT)
target_link_libraries(gemm PUBLIC -lcusparse -lcusparseLt) target_link_libraries(gemm PUBLIC CUDA::cusparse -lcusparseLt)
endif() endif()
add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu) add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "stdlib.h" #include "stdlib.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <dirent.h> #include <filesystem>
#include <numeric> #include <numeric>
#include <stdlib.h> #include <stdlib.h>
#include <string> #include <string>
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
namespace fs = std::filesystem;
namespace turbomind { namespace turbomind {
Tensor::Tensor(): Tensor::Tensor():
...@@ -44,7 +45,7 @@ Tensor::Tensor(): ...@@ -44,7 +45,7 @@ Tensor::Tensor():
} }
Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data): Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
where(_where), type(_type), shape(_shape), data(_data) where(_where), type(_type), shape(_shape), data(const_cast<void*>(_data))
{ {
} }
...@@ -53,7 +54,7 @@ Tensor::Tensor(const MemoryType _where, ...@@ -53,7 +54,7 @@ Tensor::Tensor(const MemoryType _where,
const std::vector<size_t> _shape, const std::vector<size_t> _shape,
const void* _data, const void* _data,
const std::vector<size_t> _offset): const std::vector<size_t> _offset):
where(_where), type(_type), shape(_shape), data(_data), offsets(_offset) where(_where), type(_type), shape(_shape), data(const_cast<void*>(_data)), offsets(_offset)
{ {
} }
...@@ -407,13 +408,9 @@ std::string TensorMap::toString() ...@@ -407,13 +408,9 @@ std::string TensorMap::toString()
TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
{ {
DIR* dir_p = opendir(base_folder.c_str());
FT_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. ", base_folder.c_str()));
struct dirent* dp;
TensorMap ret_tensor; TensorMap ret_tensor;
while ((dp = readdir(dir_p)) != nullptr) { for (auto const& entry : fs::directory_iterator{base_folder}) {
std::string filename(dp->d_name); std::string filename = entry.path().stem().string();
size_t len = filename.length(); size_t len = filename.length();
if (len < 4 || filename.compare(len - 4, 4, ".npy")) { if (len < 4 || filename.compare(len - 4, 4, ".npy")) {
continue; continue;
...@@ -439,18 +436,13 @@ TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) ...@@ -439,18 +436,13 @@ TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)}); ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)});
} }
closedir(dir_p);
return ret_tensor; return ret_tensor;
} }
void TensorMap::saveNpy(const std::string& base_folder) void TensorMap::saveNpy(const std::string& base_folder)
{ {
mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; bool ret = fs::exists(base_folder) | fs::create_directory(base_folder);
int ret = mkdir(base_folder.c_str(), mode_0755); FT_CHECK_WITH_INFO(ret == true, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
FT_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
for (const auto& item : tensor_map_) { for (const auto& item : tensor_map_) {
item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy"); item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy");
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#pragma once #pragma once
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h" #include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h" #include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
...@@ -24,7 +25,6 @@ ...@@ -24,7 +25,6 @@
#include "stdlib.h" #include "stdlib.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <dirent.h>
#include <numeric> #include <numeric>
#include <stdlib.h> #include <stdlib.h>
#include <string> #include <string>
...@@ -107,11 +107,11 @@ typedef enum memorytype_enum ...@@ -107,11 +107,11 @@ typedef enum memorytype_enum
} MemoryType; } MemoryType;
struct Tensor { struct Tensor {
const MemoryType where; MemoryType where;
const DataType type; DataType type;
const std::vector<size_t> shape; std::vector<size_t> shape;
const void* data; // TODO(bhseuh) modify from const void* to void* const void* data;
const std::vector<size_t> offsets = std::vector<size_t>{}; std::vector<size_t> offsets = std::vector<size_t>{};
Tensor(); Tensor();
Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data); Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data);
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#pragma once #pragma once
#include "cuda_utils.h" #include "cuda_utils.h"
#include "src/turbomind/macro.h"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "cublasFP8MMWrapper.h" #include "cublasFP8MMWrapper.h"
#include "cuda_utils.h" #include "cuda_utils.h"
#include "src/turbomind/macro.h"
namespace turbomind { namespace turbomind {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "cublasMMWrapper.h" #include "cublasMMWrapper.h"
#include "cuda_utils.h" #include "cuda_utils.h"
#include "src/turbomind/macro.h"
#ifndef CUDART_VERSION #ifndef CUDART_VERSION
#error CUDART_VERSION Undefined! #error CUDART_VERSION Undefined!
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "cuda_utils.h" #include "cuda_utils.h"
#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasAlgoMap.h" #include "src/turbomind/utils/cublasAlgoMap.h"
#include <array>
#include <cublasLt.h> #include <cublasLt.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*/ */
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_fp8_utils.h" #include "src/turbomind/utils/cuda_fp8_utils.h"
namespace turbomind { namespace turbomind {
...@@ -250,6 +251,8 @@ void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr) ...@@ -250,6 +251,8 @@ void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr)
} }
} }
// multiple definitions for msvc
#ifndef _MSC_VER
void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr) void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
{ {
typedef size_t T; typedef size_t T;
...@@ -286,6 +289,7 @@ void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr) ...@@ -286,6 +289,7 @@ void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
free(tmp); free(tmp);
} }
} }
#endif
template<typename T> template<typename T>
void check_max_val(const T* result, const int size) void check_max_val(const T* result, const int size)
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#pragma once #pragma once
#include "3rdparty/INIReader.h" #include "3rdparty/INIReader.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h" #include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
...@@ -264,11 +265,6 @@ public: ...@@ -264,11 +265,6 @@ public:
~CudaTimer() {} ~CudaTimer() {}
}; };
static double diffTime(timeval start, timeval end)
{
return (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
}
/* ***************************** common utils ****************************** */ /* ***************************** common utils ****************************** */
inline void print_mem_usage(std::string time = "after allocation") inline void print_mem_usage(std::string time = "after allocation")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment