Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Lmdeploy
Commits
9484fd1c
"vscode:/vscode.git/clone" did not exist on "c11fdad82d3238983a472c8c123f8e1cf2b3128b"
Commit
9484fd1c
authored
Dec 20, 2023
by
xiabo
Browse files
Adapt to 0.1.0
parent
477f2db8
Changes
56
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
392 additions
and
295 deletions
+392
-295
src/turbomind/kernels/sampling_topk_kernels.cu
src/turbomind/kernels/sampling_topk_kernels.cu
+2
-1
src/turbomind/kernels/sampling_topp_kernels.cu
src/turbomind/kernels/sampling_topp_kernels.cu
+2
-1
src/turbomind/kernels/stop_criteria_kernels.cu
src/turbomind/kernels/stop_criteria_kernels.cu
+2
-1
src/turbomind/kernels/unfused_attention_kernels.cu
src/turbomind/kernels/unfused_attention_kernels.cu
+21
-4
src/turbomind/layers/CMakeLists.txt
src/turbomind/layers/CMakeLists.txt
+7
-5
src/turbomind/layers/sampling_layers/CMakeLists.txt
src/turbomind/layers/sampling_layers/CMakeLists.txt
+14
-10
src/turbomind/models/llama/CMakeLists.txt
src/turbomind/models/llama/CMakeLists.txt
+14
-12
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
+11
-2
src/turbomind/models/llama/LlamaLinear.h
src/turbomind/models/llama/LlamaLinear.h
+18
-18
src/turbomind/models/llama/llama_decoder_kernels.cu
src/turbomind/models/llama/llama_decoder_kernels.cu
+49
-14
src/turbomind/models/llama/llama_kernels.cu
src/turbomind/models/llama/llama_kernels.cu
+43
-43
src/turbomind/python/bind.cpp
src/turbomind/python/bind.cpp
+17
-16
src/turbomind/triton_backend/CMakeLists.txt
src/turbomind/triton_backend/CMakeLists.txt
+30
-16
src/turbomind/triton_backend/llama/CMakeLists.txt
src/turbomind/triton_backend/llama/CMakeLists.txt
+5
-3
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+3
-2
src/turbomind/utils/CMakeLists.txt
src/turbomind/utils/CMakeLists.txt
+50
-44
src/turbomind/utils/allocator.h
src/turbomind/utils/allocator.h
+32
-32
src/turbomind/utils/cublasAlgoMap.cc
src/turbomind/utils/cublasAlgoMap.cc
+2
-1
src/turbomind/utils/cublasFP8MMWrapper.cu
src/turbomind/utils/cublasFP8MMWrapper.cu
+8
-8
src/turbomind/utils/cublasINT8MMWrapper.cc
src/turbomind/utils/cublasINT8MMWrapper.cc
+62
-62
No files found.
src/turbomind/kernels/sampling_topk_kernels.cu
View file @
9484fd1c
...
@@ -21,7 +21,8 @@
...
@@ -21,7 +21,8 @@
#elif (CUDART_VERSION >= 11000)
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#include <cub/cub.cuh>
#else
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#endif
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
...
...
src/turbomind/kernels/sampling_topp_kernels.cu
View file @
9484fd1c
...
@@ -19,7 +19,8 @@
...
@@ -19,7 +19,8 @@
#elif (CUDART_VERSION >= 11000)
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#include <cub/cub.cuh>
#else
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#endif
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
...
...
src/turbomind/kernels/stop_criteria_kernels.cu
View file @
9484fd1c
...
@@ -145,7 +145,8 @@ void invokeLengthCriterion(bool* finished,
...
@@ -145,7 +145,8 @@ void invokeLengthCriterion(bool* finished,
// Check if we have attained the sequence length limit. If so, stop the sequence.
// Check if we have attained the sequence length limit. If so, stop the sequence.
// In addition, check if all sequences are stopped and return the result in should_stop
// In addition, check if all sequences are stopped and return the result in should_stop
TM_LOG_DEBUG
(
"%s start"
,
__PRETTY_FUNCTION__
);
TM_LOG_DEBUG
(
"%s start"
,
__PRETTY_FUNCTION__
);
dim3
block
{
min
(
512
,
uint32_t
(
batch_size
*
beam_width
))};
// dim3 block{min(512, uint32_t(batch_size * beam_width))};
dim3
block
{
static_cast
<
unsigned
int
>
(
min
(
512
,
uint32_t
(
batch_size
*
beam_width
)))};
dim3
grid
{
1
};
dim3
grid
{
1
};
h_pinned_finished_sum_
[
0
]
=
-
1
;
h_pinned_finished_sum_
[
0
]
=
-
1
;
...
...
src/turbomind/kernels/unfused_attention_kernels.cu
View file @
9484fd1c
...
@@ -178,7 +178,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
...
@@ -178,7 +178,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
qk_bias
=
hadd2
<
T2
>
(
qk_bias
,
hmul2
<
T2
>
(
hsub2
<
T2
>
(
ONE
,
mask_val
),
NEG_INFTY
));
qk_bias
=
hadd2
<
T2
>
(
qk_bias
,
hmul2
<
T2
>
(
hsub2
<
T2
>
(
ONE
,
mask_val
),
NEG_INFTY
));
data
[
i
]
=
hadd2
<
T2
>
(
hmul2
<
T2
>
(
qk
,
qk_scale_h2
),
qk_bias
);
data
[
i
]
=
hadd2
<
T2
>
(
hmul2
<
T2
>
(
qk
,
qk_scale_h2
),
qk_bias
);
local_max
=
fmax
(
local_max
,
fmax
((
float
)
data
[
i
].
x
,
(
float
)
data
[
i
].
y
));
// if (std::is_same<T2, half2>::value) {
local_max
=
fmax
(
local_max
,
fmax
((
float
)
data
[
i
].
data
[
0
],
(
float
)
data
[
i
].
data
[
1
]));
// } else {
// local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
// }
}
}
float
max_val
=
blockDim
.
x
<=
32
?
warpReduceMax
(
local_max
)
:
blockReduceMax
<
float
>
(
local_max
);
float
max_val
=
blockDim
.
x
<=
32
?
warpReduceMax
(
local_max
)
:
blockReduceMax
<
float
>
(
local_max
);
...
@@ -190,7 +194,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
...
@@ -190,7 +194,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
float
local_sum
=
0.0
f
;
float
local_sum
=
0.0
f
;
for
(
int
i
=
0
;
blockDim
.
x
*
i
+
threadIdx
.
x
<
(
k_length
/
2
)
&&
i
<
ITEMS_PER_THREAD
;
i
++
)
{
for
(
int
i
=
0
;
blockDim
.
x
*
i
+
threadIdx
.
x
<
(
k_length
/
2
)
&&
i
<
ITEMS_PER_THREAD
;
i
++
)
{
data
[
i
]
=
hexp2
<
T2
>
(
hsub2
<
T2
>
(
data
[
i
],
cuda_cast
<
T2
>
(
s_max
)));
data
[
i
]
=
hexp2
<
T2
>
(
hsub2
<
T2
>
(
data
[
i
],
cuda_cast
<
T2
>
(
s_max
)));
local_sum
+=
(
float
)(
data
[
i
].
x
+
data
[
i
].
y
);
// if (std::is_same<T2, half2>::value) {
local_sum
+=
(
float
)(
data
[
i
].
data
[
0
]
+
data
[
i
].
data
[
1
]);
// } else {
// local_sum += (float)(data[i].x + data[i].y);
// }
}
}
float
sum_val
=
blockDim
.
x
<=
32
?
warpReduceSum
(
local_sum
)
:
blockReduceSum
<
float
>
(
local_sum
);
float
sum_val
=
blockDim
.
x
<=
32
?
warpReduceSum
(
local_sum
)
:
blockReduceSum
<
float
>
(
local_sum
);
...
@@ -310,7 +318,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
...
@@ -310,7 +318,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
val
=
hadd2
<
T2
>
(
val
,
pos_bias
[
j
]);
val
=
hadd2
<
T2
>
(
val
,
pos_bias
[
j
]);
}
}
data
[
j
][
i
]
=
val
;
data
[
j
][
i
]
=
val
;
local_max
[
j
]
=
fmax
(
local_max
[
j
],
fmax
((
float
)
data
[
j
][
i
].
x
,
(
float
)
data
[
j
][
i
].
y
));
// if (std::is_same<T2, half2>::value) {
local_max
[
j
]
=
fmax
(
local_max
[
j
],
fmax
((
float
)
data
[
j
][
i
].
data
[
0
],
(
float
)
data
[
j
][
i
].
data
[
1
]));
// } else {
// local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
// }
}
}
}
}
...
@@ -343,7 +355,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
...
@@ -343,7 +355,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
#pragma unroll
#pragma unroll
for
(
int
j
=
0
;
j
<
Q_ITEMS
;
j
++
)
{
for
(
int
j
=
0
;
j
<
Q_ITEMS
;
j
++
)
{
local_sum
[
j
]
+=
(
float
)(
data
[
j
][
i
].
x
+
data
[
j
][
i
].
y
);
// if (std::is_same<T2, half2>::value) {
local_sum
[
j
]
+=
(
float
)(
data
[
j
][
i
].
data
[
0
]
+
data
[
j
][
i
].
data
[
1
]);
// } else {
// local_sum[j] += (float)(data[j][i].x + data[j][i].y);
// }
}
}
}
}
...
@@ -1885,6 +1901,7 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
...
@@ -1885,6 +1901,7 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
qk_scale
);
qk_scale
);
}
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
printf
(
"============xiabo_test %s:%d
\n
"
,
__FILE__
,
__LINE__
);
softmax_withRelPosBias_element2_kernel
<
half2
,
half
>
softmax_withRelPosBias_element2_kernel
<
half2
,
half
>
<<<
grid
,
block
,
0
,
stream
>>>
((
half2
*
)
qk_buf
,
<<<
grid
,
block
,
0
,
stream
>>>
((
half2
*
)
qk_buf
,
(
const
half2
*
)
attn_mask
,
(
const
half2
*
)
attn_mask
,
...
...
src/turbomind/layers/CMakeLists.txt
View file @
9484fd1c
...
@@ -13,12 +13,14 @@
...
@@ -13,12 +13,14 @@
# limitations under the License.
# limitations under the License.
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_subdirectory
(
sampling_layers
)
add_subdirectory
(
sampling_layers
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_library
(
DynamicDecodeLayer STATIC DynamicDecodeLayer.cc
)
add_library
(
DynamicDecodeLayer STATIC DynamicDecodeLayer.cc
)
set_property
(
TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
DynamicDecodeLayer PUBLIC
CUDA::
cudart TopKSamplingLayer
target_link_libraries
(
DynamicDecodeLayer PUBLIC cudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils
)
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils
)
src/turbomind/layers/sampling_layers/CMakeLists.txt
View file @
9484fd1c
...
@@ -14,19 +14,23 @@
...
@@ -14,19 +14,23 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
BaseSamplingLayer STATIC BaseSamplingLayer.cc
)
add_library
(
BaseSamplingLayer STATIC BaseSamplingLayer.cc
)
set_property
(
TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
BaseSamplingLayer PUBLIC
CUDA::
cudart sampling_penalty_kernels memory_utils
)
target_link_libraries
(
BaseSamplingLayer PUBLIC cudart sampling_penalty_kernels memory_utils
)
add_library
(
TopKSamplingLayer STATIC TopKSamplingLayer.cu
)
add_library
(
TopKSamplingLayer STATIC TopKSamplingLayer.cu
)
set_property
(
TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
TopKSamplingLayer PUBLIC
CUDA::
cudart BaseSamplingLayer sampling_topk_kernels
)
target_link_libraries
(
TopKSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels
)
add_library
(
TopPSamplingLayer STATIC TopPSamplingLayer.cu
)
add_library
(
TopPSamplingLayer STATIC TopPSamplingLayer.cu
)
set_property
(
TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
TopPSamplingLayer PUBLIC
CUDA::
cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels
)
target_link_libraries
(
TopPSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels
)
src/turbomind/models/llama/CMakeLists.txt
View file @
9484fd1c
...
@@ -2,9 +2,10 @@
...
@@ -2,9 +2,10 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
add_subdirectory
(
fused_multi_head_attention
)
#
add_subdirectory(fused_multi_head_attention)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_library
(
Llama STATIC
add_library
(
Llama STATIC
LlamaV2.cc
LlamaV2.cc
...
@@ -19,10 +20,12 @@ add_library(Llama STATIC
...
@@ -19,10 +20,12 @@ add_library(Llama STATIC
llama_kernels.cu
llama_kernels.cu
llama_decoder_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu
)
llama_utils.cu
)
set_property
(
TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set_property
(
TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
target_link_libraries
(
Llama PUBLIC CUDA::cudart
#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
gemm_s4_f16
#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
Llama PUBLIC cudart
# gemm_s4_f16
cublasMMWrapper
cublasMMWrapper
DynamicDecodeLayer
DynamicDecodeLayer
activation_kernels
activation_kernels
...
@@ -38,17 +41,16 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
...
@@ -38,17 +41,16 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
memory_utils
memory_utils
nccl_utils
nccl_utils
cuda_utils
cuda_utils
logger
logger
)
llama_fmha
)
#
llama_fmha)
if
(
NOT MSVC
)
if
(
NOT MSVC
)
add_subdirectory
(
flash_attention2
)
#
add_subdirectory(flash_attention2)
target_link_libraries
(
Llama PUBLIC flash_attention2
)
#
target_link_libraries(Llama PUBLIC flash_attention2)
endif
()
endif
()
add_executable
(
llama_gemm llama_gemm.cc
)
add_executable
(
llama_gemm llama_gemm.cc
)
target_link_libraries
(
llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger
)
target_link_libraries
(
llama_gemm PUBLIC cudart gpt_gemm_func memory_utils cuda_utils logger
)
install
(
TARGETS llama_gemm DESTINATION
${
CMAKE_SOURCE_DIR
}
/lmdeploy/bin
)
install
(
TARGETS llama_gemm DESTINATION
${
CMAKE_SOURCE_DIR
}
/lmdeploy/bin
)
find_package
(
Catch2 3 QUIET
)
find_package
(
Catch2 3 QUIET
)
...
...
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
View file @
9484fd1c
...
@@ -22,10 +22,18 @@
...
@@ -22,10 +22,18 @@
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include <filesystem>
// #include <filesystem>
#include <experimental/filesystem>
#include <sys/stat.h>
#include <string>
namespace
turbomind
{
namespace
turbomind
{
// Reports whether `path` refers to an existing filesystem entry.
// The answer is "yes" exactly when stat(2) succeeds on the path; any stat
// failure (missing entry, permission problem, ...) is reported as "no".
bool fileExists(const std::string& path)
{
    struct stat info;
    const int   rc = stat(path.c_str(), &info);
    return rc == 0;
}
template
<
typename
T
>
template
<
typename
T
>
LlamaDecoderLayerWeight
<
T
>::
LlamaDecoderLayerWeight
(
size_t
head_num
,
LlamaDecoderLayerWeight
<
T
>::
LlamaDecoderLayerWeight
(
size_t
head_num
,
size_t
kv_head_num
,
size_t
kv_head_num
,
...
@@ -170,7 +178,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
...
@@ -170,7 +178,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
}
}
else
{
else
{
// Disable slice if weight has already been sliced
// Disable slice if weight has already been sliced
if
(
std
::
filesystem
::
exists
(
max_prefix
+
".weight"
)
||
std
::
filesystem
::
exists
(
max_prefix
+
".qweight"
))
{
// if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
if
(
fileExists
(
max_prefix
+
".weight"
)
||
fileExists
(
max_prefix
+
".qweight"
))
{
TM_LOG_DEBUG
(
"TP weight exists. Disable runtime TP."
);
TM_LOG_DEBUG
(
"TP weight exists. Disable runtime TP."
);
enable_slice
=
false
;
enable_slice
=
false
;
}
}
...
...
src/turbomind/models/llama/LlamaLinear.h
View file @
9484fd1c
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
#pragma once
#pragma once
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
//
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
...
@@ -62,29 +62,29 @@ private:
...
@@ -62,29 +62,29 @@ private:
void
forwardInt4
(
T
*
output_data
,
const
T
*
input_data
,
int
batch_size
,
const
LlamaDenseWeight
<
T
>&
weight
,
Type
type
)
void
forwardInt4
(
T
*
output_data
,
const
T
*
input_data
,
int
batch_size
,
const
LlamaDenseWeight
<
T
>&
weight
,
Type
type
)
{
{
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
//
if constexpr (std::is_same_v<T, half>) {
gemm_s4_f16_
.
Run
(
output_data
,
//
gemm_s4_f16_.Run(output_data,
(
const
uint
*
)
weight
.
kernel
,
//
(const uint*)weight.kernel,
input_data
,
//
input_data,
(
const
half2
*
)
weight
.
scales_and_zeros
,
//
(const half2*)weight.scales_and_zeros,
weight
.
output_dims
,
//
weight.output_dims,
batch_size
,
//
batch_size,
weight
.
input_dims
,
//
weight.input_dims,
weight
.
group_size
,
//
weight.group_size,
type
==
kFusedSiluFfn
?
GemmS4F16
::
kFusedSiluFfn
:
GemmS4F16
::
kGemm
,
//
type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
-
1
,
//
-1,
stream_
);
//
stream_);
sync_check_cuda_error
();
//
sync_check_cuda_error();
}
//
}
else
{
//
else {
FT_CHECK_WITH_INFO
(
0
,
"Not implemented"
);
FT_CHECK_WITH_INFO
(
0
,
"Not implemented"
);
}
//
}
}
}
private:
private:
cublasMMWrapper
*
cublas_wrapper_
;
cublasMMWrapper
*
cublas_wrapper_
;
cudaStream_t
stream_
{};
cudaStream_t
stream_
{};
GemmS4F16
gemm_s4_f16_
;
//
GemmS4F16 gemm_s4_f16_;
};
};
}
// namespace turbomind
}
// namespace turbomind
src/turbomind/models/llama/llama_decoder_kernels.cu
View file @
9484fd1c
...
@@ -110,24 +110,58 @@ struct res_norm_ops_t<__nv_bfloat16> {
...
@@ -110,24 +110,58 @@ struct res_norm_ops_t<__nv_bfloat16> {
#endif
#endif
template
<
typename
T
>
//
template<typename T>
__device__
T
blockReduceSum
(
const
cg
::
thread_block
&
block
,
T
value
)
//
__device__ T blockReduceSum(const cg::thread_block& block, T value)
{
//
{
__shared__
float
partial
[
32
];
//
__shared__ float partial[32];
auto
tile
=
cg
::
tiled_partition
<
32
>
(
block
);
//
auto tile = cg::tiled_partition<32>(block);
value
=
cg
::
reduce
(
tile
,
value
,
cg
::
plus
<
float
>
{});
//
value = cg::reduce(tile, value, cg::plus<float>{});
if
(
tile
.
thread_rank
()
==
0
)
{
//
if (tile.thread_rank() == 0) {
partial
[
tile
.
meta_group_rank
()]
=
value
;
//
partial[tile.meta_group_rank()] = value;
}
//
}
block
.
sync
();
//
block.sync();
value
=
tile
.
thread_rank
()
<
tile
.
meta_group_size
()
?
partial
[
tile
.
thread_rank
()]
:
T
{};
// value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{};
return
cg
::
reduce
(
tile
,
value
,
cg
::
plus
<
float
>
{});
// return cg::reduce(tile, value, cg::plus<float>{});
// }
#define WARPSIZE 64
// Adds up `value` across the lanes of one warp with shuffle-down exchanges.
// The shuffle distance starts at WARPSIZE / 2 and is halved every step; once
// the loop finishes, lane 0 holds the warp-wide total provided every lane of
// the warp took part (the other lanes only hold partial sums).
// NOTE(review): the participation mask 0xffffffff is 32 bits wide while
// WARPSIZE is 64 -- presumably the target toolchain accepts this for 64-lane
// warps; confirm before building for a platform with 32-lane warps.
template<typename T>
__inline__ __device__ T warpReduceSum_xiabo(T value)
{
    T acc = value;
#pragma unroll
    for (int delta = WARPSIZE >> 1; delta >= 1; delta /= 2) {
        acc += __shfl_down_sync(0xffffffff, acc, delta);
    }
    return acc;
}
}
// Block-wide sum of `val`, returned to every thread of the block.
//
// Stage 1: each warp reduces its own values with warpReduceSum_xiabo and the
//          warp's first lane stores the partial sum in shared memory.
// Stage 2: the first warp reduces the per-warp partials and thread 0 publishes
//          the total through shared[0].
//
// Requirements visible from the code:
//   * every thread of the block must call this function (it uses
//     __syncthreads());
//   * the linear thread id is built from threadIdx.x / threadIdx.y only, so
//     the block is expected to be 1-D or 2-D;
//   * at most WARPSIZE warps per block (shared[] has WARPSIZE slots).
// NOTE(review): the warp shuffles use a full participation mask, so the block
// size should presumably be a multiple of WARPSIZE -- confirm against callers.
template<typename T>
__inline__ __device__ T blockReduceSum_xiabo(T val)
{
    __shared__ T shared[WARPSIZE];

    const int tid         = threadIdx.x + threadIdx.y * blockDim.x;
    const int num_threads = blockDim.x * blockDim.y;
    // Round up: a trailing partial warp still owns a slot in shared[]. The
    // previous floor division produced 0 warps for blocks smaller than
    // WARPSIZE (thread 0 then zeroed its own partial) and let low thread ids
    // zero the slot written by the last warp for non-multiple block sizes.
    const int num_warps = (num_threads + WARPSIZE - 1) / WARPSIZE;

    T sum = warpReduceSum_xiabo(val);

    // Make sure nobody is still reading shared[0] from a previous call before
    // the per-warp partials are written.
    __syncthreads();

    if (tid % WARPSIZE == 0) {
        shared[tid / WARPSIZE] = sum;
    }

    __syncthreads();

    if (tid / WARPSIZE == 0) {
        // Lanes without a matching warp contribute zero. Selecting on load
        // replaces the old zero-fill store, so no thread writes a slot that
        // belongs to another warp's partial sum.
        sum = warpReduceSum_xiabo(tid < num_warps ? shared[tid] : (T)(0.0f));
        if (tid == 0) {
            shared[0] = sum;
        }
    }

    // Publish the total to the whole block.
    __syncthreads();

    return shared[0];
}
// r' = r + x
// r' = r + x
// x' = norm(r') * scales
// x' = norm(r') * scales
template
<
typename
T
>
template
<
typename
T
>
...
@@ -140,7 +174,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
...
@@ -140,7 +174,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
int
n_dims
)
int
n_dims
)
{
{
auto
block
=
cg
::
this_thread_block
();
auto
block
=
cg
::
this_thread_block
();
auto
grid
=
cg
::
this_grid
();
//
auto grid = cg::this_grid();
constexpr
int
PACK_DIM
=
sizeof
(
uint4
)
/
sizeof
(
T
);
constexpr
int
PACK_DIM
=
sizeof
(
uint4
)
/
sizeof
(
T
);
...
@@ -160,7 +194,8 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
...
@@ -160,7 +194,8 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
r_ptr
[
i
]
=
r
;
r_ptr
[
i
]
=
r
;
}
}
auto
total_sum
=
blockReduceSum
(
block
,
thread_sum
);
// auto total_sum = blockReduceSum(block, thread_sum);
auto
total_sum
=
blockReduceSum_xiabo
(
thread_sum
);
float
s_inv_mean
=
rsqrt
(
total_sum
/
n_dims
+
eps
);
float
s_inv_mean
=
rsqrt
(
total_sum
/
n_dims
+
eps
);
...
...
src/turbomind/models/llama/llama_kernels.cu
View file @
9484fd1c
...
@@ -918,50 +918,50 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud
...
@@ -918,50 +918,50 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud
} \
} \
}()
}()
template
<
typename
T
>
//
template<typename T>
FlashAttentionOp
<
T
>::
FlashAttentionOp
(
int
batch_size
,
int
head_num
,
int
key_len
,
int
seq_len
,
int
size_per_head
)
:
//
FlashAttentionOp<T>::FlashAttentionOp(int batch_size, int head_num, int key_len, int seq_len, int size_per_head):
batch_size_
(
batch_size
),
head_num_
(
head_num
),
key_len_
(
key_len
),
seq_len_
(
seq_len
),
size_per_head_
(
size_per_head
)
//
batch_size_(batch_size), head_num_(head_num), key_len_(key_len), seq_len_(seq_len), size_per_head_(size_per_head)
{
//
{
#ifdef _MSC_VER
//
#ifdef _MSC_VER
op_version_
=
1
;
//
op_version_ = 1;
#else
//
#else
op_version_
=
std
::
is_same
<
float
,
typename
std
::
decay
<
T
>::
type
>::
value
?
1
:
2
;
//
op_version_ = std::is_same<float, typename std::decay<T>::type>::value ? 1 : 2;
if
(
op_version_
==
2
&&
getSMVersion
()
<
80
)
{
//
if (op_version_ == 2 && getSMVersion() < 80) {
op_version_
=
1
;
//
op_version_ = 1;
}
//
}
#endif
//
#endif
}
//
}
template
<
typename
T
>
//
template<typename T>
int
FlashAttentionOp
<
T
>::
get_workspace_size
()
const
//
int FlashAttentionOp<T>::get_workspace_size() const
{
//
{
#ifdef _MSC_VER
//
#ifdef _MSC_VER
FlashAttentionOpImpl
<
T
,
1
>
attention_op
(
batch_size_
,
head_num_
,
key_len_
,
seq_len_
,
size_per_head_
);
//
FlashAttentionOpImpl<T, 1> attention_op(batch_size_, head_num_, key_len_, seq_len_, size_per_head_);
return
attention_op
.
get_workspace_size
();
//
return attention_op.get_workspace_size();
#else
//
#else
return
VERSION_SWITCH
(
op_version_
,
OP_VERSION
,
[
&
]()
{
//
return VERSION_SWITCH(op_version_, OP_VERSION, [&]() {
FlashAttentionOpImpl
<
T
,
OP_VERSION
>
attention_op
(
batch_size_
,
head_num_
,
key_len_
,
seq_len_
,
size_per_head_
);
//
FlashAttentionOpImpl<T, OP_VERSION> attention_op(batch_size_, head_num_, key_len_, seq_len_, size_per_head_);
return
attention_op
.
get_workspace_size
();
//
return attention_op.get_workspace_size();
});
//
});
#endif
//
#endif
}
//
}
template
<
typename
T
>
//
template<typename T>
void
FlashAttentionOp
<
T
>::
operator
()(
Params
&
params
,
cudaStream_t
st
)
const
//
void FlashAttentionOp<T>::operator()(Params& params, cudaStream_t st) const
{
//
{
#ifdef _MSC_VER
//
#ifdef _MSC_VER
FlashAttentionOpImpl
<
T
,
1
>
attention_op
(
batch_size_
,
head_num_
,
key_len_
,
seq_len_
,
size_per_head_
);
//
FlashAttentionOpImpl<T, 1> attention_op(batch_size_, head_num_, key_len_, seq_len_, size_per_head_);
return
attention_op
(
params
,
st
);
//
return attention_op(params, st);
#else
//
#else
return
VERSION_SWITCH
(
op_version_
,
OP_VERSION
,
[
&
]()
{
//
return VERSION_SWITCH(op_version_, OP_VERSION, [&]() {
FlashAttentionOpImpl
<
T
,
OP_VERSION
>
attention_op
(
batch_size_
,
head_num_
,
key_len_
,
seq_len_
,
size_per_head_
);
//
FlashAttentionOpImpl<T, OP_VERSION> attention_op(batch_size_, head_num_, key_len_, seq_len_, size_per_head_);
return
attention_op
(
params
,
st
);
//
return attention_op(params, st);
});
//
});
#endif
//
#endif
}
//
}
template
class
FlashAttentionOp
<
float
>;
//
template class FlashAttentionOp<float>;
template
class
FlashAttentionOp
<
half
>;
//
template class FlashAttentionOp<half>;
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
template
class
FlashAttentionOp
<
__nv_bfloat16
>;
template
class
FlashAttentionOp
<
__nv_bfloat16
>;
#endif
#endif
...
...
src/turbomind/python/bind.cpp
View file @
9484fd1c
#include "src/turbomind/kernels/gemm_s_f16/format.h"
//
#include "src/turbomind/kernels/gemm_s_f16/format.h"
#include "src/turbomind/python/dlpack.h"
#include "src/turbomind/python/dlpack.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
...
@@ -43,7 +43,8 @@ DLDevice getDLDevice(triton::Tensor& tensor)
...
@@ -43,7 +43,8 @@ DLDevice getDLDevice(triton::Tensor& tensor)
device
.
device_type
=
DLDeviceType
::
kDLCUDAHost
;
device
.
device_type
=
DLDeviceType
::
kDLCUDAHost
;
break
;
break
;
case
triton
::
MEMORY_GPU
:
case
triton
::
MEMORY_GPU
:
device
.
device_type
=
DLDeviceType
::
kDLCUDA
;
// device.device_type = DLDeviceType::kDLCUDA;
device
.
device_type
=
DLDeviceType
::
kDLROCM
;
break
;
break
;
default:
default:
break
;
break
;
...
@@ -456,15 +457,15 @@ PYBIND11_MODULE(_turbomind, m)
...
@@ -456,15 +457,15 @@ PYBIND11_MODULE(_turbomind, m)
auto
src_tensor
=
GetDLTensor
(
src
);
auto
src_tensor
=
GetDLTensor
(
src
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
turbomind
::
transpose_qk_s4_k_m8_hf
(
//
turbomind::transpose_qk_s4_k_m8_hf(
(
uint32_t
*
)
dst_tensor
.
data
,
(
const
uint32_t
*
)
src_tensor
.
data
,
m
,
k
,
size_per_head
,
nullptr
);
//
(uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, size_per_head, nullptr);
});
});
m
.
def
(
"fuse_w1_w3_s4_k_m8"
,
[](
py
::
object
src
,
py
::
object
dst
,
int
m
,
int
k
)
{
m
.
def
(
"fuse_w1_w3_s4_k_m8"
,
[](
py
::
object
src
,
py
::
object
dst
,
int
m
,
int
k
)
{
auto
src_tensor
=
GetDLTensor
(
src
);
auto
src_tensor
=
GetDLTensor
(
src
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
turbomind
::
fuse_w1_w3_s4_k_m8
((
uint32_t
*
)
dst_tensor
.
data
,
(
const
uint32_t
*
)
src_tensor
.
data
,
m
,
k
,
nullptr
);
//
turbomind::fuse_w1_w3_s4_k_m8((uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, nullptr);
});
});
m
.
def
(
"convert_s4_k_m8"
,
m
.
def
(
"convert_s4_k_m8"
,
...
@@ -484,16 +485,16 @@ PYBIND11_MODULE(_turbomind, m)
...
@@ -484,16 +485,16 @@ PYBIND11_MODULE(_turbomind, m)
auto
s
=
GetDLTensor
(
scales
);
auto
s
=
GetDLTensor
(
scales
);
auto
qz
=
GetDLTensor
(
qzeros
);
auto
qz
=
GetDLTensor
(
qzeros
);
turbomind
::
convert_s4_k_m8
((
uint32_t
*
)
a_dst
.
data
,
//
turbomind::convert_s4_k_m8((uint32_t*)a_dst.data,
(
half2
*
)
q_dst
.
data
,
//
(half2*)q_dst.data,
(
half
*
)
w
.
data
,
//
(half*)w.data,
(
const
uint32_t
*
)
a_src
.
data
,
//
(const uint32_t*)a_src.data,
(
const
half
*
)
s
.
data
,
//
(const half*)s.data,
(
const
uint32_t
*
)
qz
.
data
,
//
(const uint32_t*)qz.data,
m
,
//
m,
k
,
//
k,
group_size
,
//
group_size,
nullptr
);
//
nullptr);
});
});
m
.
def
(
"dequantize_s4"
,
[](
py
::
object
src
,
py
::
object
dst
)
{
m
.
def
(
"dequantize_s4"
,
[](
py
::
object
src
,
py
::
object
dst
)
{
...
@@ -502,6 +503,6 @@ PYBIND11_MODULE(_turbomind, m)
...
@@ -502,6 +503,6 @@ PYBIND11_MODULE(_turbomind, m)
auto
src_count
=
std
::
accumulate
(
src_tensor
.
shape
,
src_tensor
.
shape
+
src_tensor
.
ndim
,
size_t
{
1
});
auto
src_count
=
std
::
accumulate
(
src_tensor
.
shape
,
src_tensor
.
shape
+
src_tensor
.
ndim
,
size_t
{
1
});
auto
dst_count
=
std
::
accumulate
(
dst_tensor
.
shape
,
dst_tensor
.
shape
+
dst_tensor
.
ndim
,
size_t
{
1
});
auto
dst_count
=
std
::
accumulate
(
dst_tensor
.
shape
,
dst_tensor
.
shape
+
dst_tensor
.
ndim
,
size_t
{
1
});
turbomind
::
FT_CHECK
(
src_count
*
8
==
dst_count
);
turbomind
::
FT_CHECK
(
src_count
*
8
==
dst_count
);
turbomind
::
dequantize_s4
((
uint4
*
)
dst_tensor
.
data
,
(
uint32_t
*
)
src_tensor
.
data
,
src_count
,
nullptr
);
//
turbomind::dequantize_s4((uint4*)dst_tensor.data, (uint32_t*)src_tensor.data, src_count, nullptr);
});
});
}
}
src/turbomind/triton_backend/CMakeLists.txt
View file @
9484fd1c
...
@@ -24,13 +24,17 @@
...
@@ -24,13 +24,17 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required
(
VERSION 3.18
)
#cmake_minimum_required (VERSION 3.18)
cmake_minimum_required
(
VERSION 3.16
)
project
(
tritonturbomindbackend LANGUAGES C CXX
)
project
(
tritonturbomindbackend LANGUAGES C CXX
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
TransformerTritonBackend STATIC transformer_triton_backend.cpp
)
add_library
(
TransformerTritonBackend STATIC transformer_triton_backend.cpp
)
target_link_libraries
(
TransformerTritonBackend PUBLIC nccl_utils
)
target_link_libraries
(
TransformerTritonBackend PUBLIC nccl_utils
)
set_property
(
TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
install
(
TARGETS TransformerTritonBackend DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
)
install
(
TARGETS TransformerTritonBackend DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
)
add_subdirectory
(
llama
)
add_subdirectory
(
llama
)
...
@@ -70,21 +74,24 @@ include(FetchContent)
...
@@ -70,21 +74,24 @@ include(FetchContent)
FetchContent_Declare
(
FetchContent_Declare
(
repo-common
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
URL ../../../3rdparty/common-r22.12
GIT_TAG
${
TRITON_COMMON_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_COMMON_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_Declare
(
FetchContent_Declare
(
repo-core
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
URL ../../../3rdparty/core-r22.12
GIT_TAG
${
TRITON_CORE_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_CORE_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_Declare
(
FetchContent_Declare
(
repo-backend
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
URL ../../../3rdparty/backend-r22.12
GIT_TAG
${
TRITON_BACKEND_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_BACKEND_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_MakeAvailable
(
repo-common repo-core repo-backend
)
FetchContent_MakeAvailable
(
repo-common repo-core repo-backend
)
...
@@ -92,7 +99,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
...
@@ -92,7 +99,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
# CUDA
# CUDA
#
#
if
(
${
TRITON_ENABLE_GPU
}
)
if
(
${
TRITON_ENABLE_GPU
}
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
#
#
...
@@ -109,7 +117,8 @@ add_library(
...
@@ -109,7 +117,8 @@ add_library(
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
)
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
find_package
(
CUDA 10.1 REQUIRED
)
find_package
(
CUDA 10.1 REQUIRED
)
if
(
${
CUDA_VERSION
}
GREATER_EQUAL 11.0
)
if
(
${
CUDA_VERSION
}
GREATER_EQUAL 11.0
)
message
(
STATUS
"Add DCUDA11_MODE"
)
message
(
STATUS
"Add DCUDA11_MODE"
)
...
@@ -158,10 +167,14 @@ if(${TRITON_ENABLE_GPU})
...
@@ -158,10 +167,14 @@ if(${TRITON_ENABLE_GPU})
)
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
set_target_properties
(
set_target_properties
(
triton-turbomind-backend
triton-turbomind-backend
PROPERTIES
PROPERTIES
POSITION_INDEPENDENT_CODE ON
# POSITION_INDEPENDENT_CODE ON
POSITION_INDEPENDENT_CODE OFF
OUTPUT_NAME triton_turbomind
OUTPUT_NAME triton_turbomind
SKIP_BUILD_RPATH TRUE
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
...
@@ -194,7 +207,7 @@ target_link_libraries(
...
@@ -194,7 +207,7 @@ target_link_libraries(
transformer-shared
# from repo-ft
transformer-shared
# from repo-ft
${
TRITON_PYTORCH_LDFLAGS
}
${
TRITON_PYTORCH_LDFLAGS
}
-lcublas
-lcublas
-lcublasLt
#
-lcublasLt
-lcudart
-lcudart
-lcurand
-lcurand
)
)
...
@@ -228,7 +241,8 @@ if(${TRITON_ENABLE_GPU})
...
@@ -228,7 +241,8 @@ if(${TRITON_ENABLE_GPU})
target_link_libraries
(
target_link_libraries
(
triton-turbomind-backend
triton-turbomind-backend
PRIVATE
PRIVATE
CUDA::cudart
# CUDA::cudart
cudart
)
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
...
...
src/turbomind/triton_backend/llama/CMakeLists.txt
View file @
9484fd1c
...
@@ -22,8 +22,10 @@ set(llama_triton_backend_files
...
@@ -22,8 +22,10 @@ set(llama_triton_backend_files
LlamaTritonModelInstance.cc
LlamaTritonModelInstance.cc
)
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_library
(
LlamaTritonBackend STATIC
${
llama_triton_backend_files
}
)
add_library
(
LlamaTritonBackend STATIC
${
llama_triton_backend_files
}
)
set_property
(
TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON
)
#set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries
(
LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt
)
#target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt)
target_link_libraries
(
LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils
)
target_compile_features
(
LlamaTritonBackend PRIVATE cxx_std_14
)
target_compile_features
(
LlamaTritonBackend PRIVATE cxx_std_14
)
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
View file @
9484fd1c
...
@@ -258,7 +258,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
...
@@ -258,7 +258,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
cublasLtHandle_t
cublaslt_handle
;
cublasLtHandle_t
cublaslt_handle
;
cublasCreate
(
&
cublas_handle
);
cublasCreate
(
&
cublas_handle
);
cublasLtCreate
(
&
cublaslt_handle
);
//
cublasLtCreate(&cublaslt_handle);
cublasSetStream
(
cublas_handle
,
stream
);
cublasSetStream
(
cublas_handle
,
stream
);
std
::
unique_ptr
<
ft
::
cublasAlgoMap
>
cublas_algo_map
(
new
ft
::
cublasAlgoMap
(
"gemm_config.in"
));
std
::
unique_ptr
<
ft
::
cublasAlgoMap
>
cublas_algo_map
(
new
ft
::
cublasAlgoMap
(
"gemm_config.in"
));
...
@@ -270,7 +270,8 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
...
@@ -270,7 +270,8 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
ft
::
check_cuda_error
(
cudaGetDeviceProperties
(
cuda_device_prop_ptr
.
get
(),
device_id
));
ft
::
check_cuda_error
(
cudaGetDeviceProperties
(
cuda_device_prop_ptr
.
get
(),
device_id
));
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
cublas_wrapper
->
setGemmConfig
(
CUDA_R_16F
,
CUDA_R_16F
,
CUDA_R_16F
,
CUDA_R_32F
);
// cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F);
cublas_wrapper
->
setGemmConfig
(
CUDA_R_16F
,
CUDA_R_16F
,
CUDA_R_16F
,
CUDA_R_16F
);
}
}
else
if
(
std
::
is_same
<
T
,
float
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
float
>::
value
)
{
cublas_wrapper
->
setFP32GemmConfig
();
cublas_wrapper
->
setFP32GemmConfig
();
...
...
src/turbomind/utils/CMakeLists.txt
View file @
9484fd1c
...
@@ -14,98 +14,104 @@
...
@@ -14,98 +14,104 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_subdirectory
(
gemm_test
)
add_subdirectory
(
gemm_test
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
cuda_utils STATIC cuda_utils.cc
)
add_library
(
cuda_utils STATIC cuda_utils.cc
)
set_property
(
TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cuda_utils PUBLIC
CUDA::
cudart
)
target_link_libraries
(
cuda_utils PUBLIC cudart
)
add_library
(
logger STATIC logger.cc
)
add_library
(
logger STATIC logger.cc
)
set_property
(
TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
logger PUBLIC
CUDA::
cudart
)
target_link_libraries
(
logger PUBLIC cudart
)
add_library
(
cublasAlgoMap STATIC cublasAlgoMap.cc
)
add_library
(
cublasAlgoMap STATIC cublasAlgoMap.cc
)
set_property
(
TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasAlgoMap PUBLIC
CUDA::
cublas
CUDA::
cudart
CUDA::
curand cuda_utils logger
)
target_link_libraries
(
cublasAlgoMap PUBLIC cublas cudart curand cuda_utils logger
)
add_library
(
cublasMMWrapper STATIC cublasMMWrapper.cc
)
add_library
(
cublasMMWrapper STATIC cublasMMWrapper.cc
)
set_property
(
TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasMMWrapper PUBLIC
CUDA::
cublas
CUDA::
cudart
CUDA::
curand cublasAlgoMap cuda_utils logger
)
target_link_libraries
(
cublasMMWrapper PUBLIC cublas cudart curand cublasAlgoMap cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
cublasMMWrapper PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
cublasMMWrapper PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
add_library
(
word_list STATIC word_list.cc
)
add_library
(
word_list STATIC word_list.cc
)
set_property
(
TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
nvtx_utils STATIC nvtx_utils.cc
)
add_library
(
nvtx_utils STATIC nvtx_utils.cc
)
set_property
(
TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.25"
)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.25"
)
target_link_libraries
(
nvtx_utils PUBLIC
CUDA::
nvToolsExt -ldl
)
#
target_link_libraries(nvtx_utils PUBLIC nvToolsExt -ldl)
else
()
else
()
target_link_libraries
(
nvtx_utils PUBLIC
CUDA::
nvtx3 -ldl
)
#
target_link_libraries(nvtx_utils PUBLIC nvtx3 -ldl)
endif
()
endif
()
add_library
(
memory_utils STATIC memory_utils.cu
)
add_library
(
memory_utils STATIC memory_utils.cu
)
set_property
(
TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
memory_utils PUBLIC cuda_utils logger tensor
)
target_link_libraries
(
memory_utils PUBLIC cuda_utils logger tensor
)
add_library
(
mpi_utils STATIC mpi_utils.cc
)
add_library
(
mpi_utils STATIC mpi_utils.cc
)
set_property
(
TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
BUILD_MULTI_GPU
)
if
(
BUILD_MULTI_GPU
)
target_link_libraries
(
mpi_utils PUBLIC mpi logger
)
target_link_libraries
(
mpi_utils PUBLIC mpi logger
)
endif
()
endif
()
add_library
(
nccl_utils STATIC nccl_utils.cc
)
add_library
(
nccl_utils STATIC nccl_utils.cc
)
set_property
(
TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
BUILD_MULTI_GPU
)
if
(
BUILD_MULTI_GPU
)
target_link_libraries
(
nccl_utils PUBLIC
${
NCCL_LIBRARIES
}
logger
)
target_link_libraries
(
nccl_utils PUBLIC
${
NCCL_LIBRARIES
}
logger
)
endif
()
endif
()
add_library
(
cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc
)
#
add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
set_property
(
TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasINT8MMWrapper PUBLIC
CUDA::
cublasLt
CUDA::
cudart
CUDA::
curand cublasAlgoMap cublasMMWrapper cuda_utils logger
)
#
target_link_libraries(cublasINT8MMWrapper PUBLIC cublasLt cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
if
(
ENABLE_FP8
)
if
(
ENABLE_FP8
)
add_library
(
cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu
)
add_library
(
cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu
)
set_property
(
TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasFP8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand
#target_link_libraries(cublasFP8MMWrapper PUBLIC cublasLt cudart curand
target_link_libraries
(
cublasFP8MMWrapper PUBLIC cudart curand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils
)
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils
)
endif
()
endif
()
add_library
(
custom_ar_comm STATIC custom_ar_comm.cc
)
add_library
(
custom_ar_comm STATIC custom_ar_comm.cc
)
set_property
(
TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger
)
target_link_libraries
(
custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger
)
add_library
(
gemm STATIC gemm.cc
)
add_library
(
gemm STATIC gemm.cc
)
set_property
(
TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
gemm PUBLIC
target_link_libraries
(
gemm PUBLIC
CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand
# cublas cublasLt cudart curand
cublas cudart curand
cublasAlgoMap memory_utils cuda_utils logger
)
cublasAlgoMap memory_utils cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
gemm PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
gemm PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
add_library
(
cuda_fp8_utils STATIC cuda_fp8_utils.cu
)
#
add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
set_property
(
TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
tensor STATIC Tensor.cc
)
add_library
(
tensor STATIC Tensor.cc
)
set_property
(
TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
)
src/turbomind/utils/allocator.h
View file @
9484fd1c
...
@@ -44,9 +44,9 @@
...
@@ -44,9 +44,9 @@
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/logger.h"
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
//
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
#define CUDA_MEMORY_POOL_DISABLED
#define CUDA_MEMORY_POOL_DISABLED
#endif
//
#endif
namespace
turbomind
{
namespace
turbomind
{
...
@@ -158,36 +158,36 @@ public:
...
@@ -158,36 +158,36 @@ public:
{
{
TM_LOG_DEBUG
(
__PRETTY_FUNCTION__
);
TM_LOG_DEBUG
(
__PRETTY_FUNCTION__
);
pointer_mapping_
=
new
std
::
unordered_map
<
void
*
,
std
::
pair
<
size_t
,
MemoryType
>>
();
pointer_mapping_
=
new
std
::
unordered_map
<
void
*
,
std
::
pair
<
size_t
,
MemoryType
>>
();
#if defined(CUDA_MEMORY_POOL_DISABLED)
//
#if defined(CUDA_MEMORY_POOL_DISABLED)
TM_LOG_WARNING
(
//
TM_LOG_WARNING(
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
//
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP"
);
//
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
#else
//
#else
int
device_count
=
1
;
//
int device_count = 1;
check_cuda_error
(
cudaGetDeviceCount
(
&
device_count
));
//
check_cuda_error(cudaGetDeviceCount(&device_count));
cudaMemPool_t
mempool
;
//
cudaMemPool_t mempool;
check_cuda_error
(
cudaDeviceGetDefaultMemPool
(
&
mempool
,
device_id
));
//
check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id));
cudaMemAccessDesc
desc
=
{};
//
cudaMemAccessDesc desc = {};
int
peer_access_available
=
0
;
//
int peer_access_available = 0;
for
(
int
i
=
0
;
i
<
device_count
;
i
++
)
{
//
for (int i = 0; i < device_count; i++) {
if
(
i
==
device_id
)
{
//
if (i == device_id) {
continue
;
//
continue;
}
//
}
check_cuda_error
(
cudaDeviceCanAccessPeer
(
&
peer_access_available
,
device_id
,
i
));
//
check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
if
(
!
peer_access_available
)
{
//
if (!peer_access_available) {
TM_LOG_WARNING
(
"Device "
+
std
::
to_string
(
device_id
)
+
" peer access Device "
+
std
::
to_string
(
i
)
//
TM_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+
" is not available."
);
//
+ " is not available.");
continue
;
//
continue;
}
//
}
desc
.
location
.
type
=
cudaMemLocationTypeDevice
;
//
desc.location.type = cudaMemLocationTypeDevice;
desc
.
location
.
id
=
i
;
//
desc.location.id = i;
desc
.
flags
=
cudaMemAccessFlagsProtReadWrite
;
//
desc.flags = cudaMemAccessFlagsProtReadWrite;
check_cuda_error
(
cudaMemPoolSetAccess
(
mempool
,
&
desc
,
1
));
//
check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1));
}
//
}
// set memory pool threshold to avoid shrinking the pool
//
// set memory pool threshold to avoid shrinking the pool
uint64_t
setVal
=
UINT64_MAX
;
//
uint64_t setVal = UINT64_MAX;
check_cuda_error
(
cudaMemPoolSetAttribute
(
mempool
,
cudaMemPoolAttrReleaseThreshold
,
&
setVal
));
//
check_cuda_error(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &setVal));
#endif
//
#endif
}
}
virtual
~
Allocator
()
virtual
~
Allocator
()
...
...
src/turbomind/utils/cublasAlgoMap.cc
View file @
9484fd1c
...
@@ -139,7 +139,8 @@ cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const in
...
@@ -139,7 +139,8 @@ cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const in
else
{
else
{
cublasLtMatmulAlgo_info
tmp_algo
;
cublasLtMatmulAlgo_info
tmp_algo
;
tmp_algo
.
algoId
=
tmp_algo
.
algoId
=
static_cast
<
int
>
(
data_type
==
FLOAT_DATATYPE
?
CUBLAS_GEMM_DEFAULT
:
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
// static_cast<int>(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
static_cast
<
int
>
(
data_type
==
FLOAT_DATATYPE
?
CUBLAS_GEMM_DEFAULT
:
CUBLAS_GEMM_DEFAULT
);
tmp_algo
.
customOption
=
-
1
;
tmp_algo
.
customOption
=
-
1
;
tmp_algo
.
tile
=
-
1
;
tmp_algo
.
tile
=
-
1
;
tmp_algo
.
splitK_val
=
-
1
;
tmp_algo
.
splitK_val
=
-
1
;
...
...
src/turbomind/utils/cublasFP8MMWrapper.cu
View file @
9484fd1c
...
@@ -237,10 +237,10 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
...
@@ -237,10 +237,10 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
info
.
reductionScheme
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
info
.
reductionScheme
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
//
cublasLtMatmulAlgoConfigSetAttribute(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
info
.
stages
),
sizeof
(
info
.
stages
));
//
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
//
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
...
@@ -462,10 +462,10 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
...
@@ -462,10 +462,10 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
info
.
reductionScheme
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
info
.
reductionScheme
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
//
cublasLtMatmulAlgoConfigSetAttribute(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
info
.
stages
),
sizeof
(
info
.
stages
));
//
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
//
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
...
...
src/turbomind/utils/cublasINT8MMWrapper.cc
View file @
9484fd1c
...
@@ -94,11 +94,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
...
@@ -94,11 +94,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
{
{
mu_
->
lock
();
mu_
->
lock
();
cublasOperation_t
opTranspose
=
CUBLAS_OP_T
;
cublasOperation_t
opTranspose
=
CUBLAS_OP_T
;
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasComputeType_t
computeType
=
CUBLAS_COMPUTE_32I
;
//
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
//
#else
cudaDataType_t
computeType
=
CUDA_R_32I
;
cudaDataType_t
computeType
=
CUDA_R_32I
;
#endif
//
#endif
cublasLtMatmulDesc_t
matmulDesc
;
cublasLtMatmulDesc_t
matmulDesc
;
cublasLtMatrixLayout_t
AtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
AtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
BtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
BtransformDesc
=
NULL
;
...
@@ -106,16 +106,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
...
@@ -106,16 +106,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
cublasLtOrder_t
order_COL32
=
CUBLASLT_ORDER_COL32
;
cublasLtOrder_t
order_COL32
=
CUBLASLT_ORDER_COL32
;
cublasLtOrder_t
order_matrixB
;
cublasLtOrder_t
order_matrixB
;
#if (CUDART_VERSION >= 11000)
// #if (CUDART_VERSION >= 11000)
if
(
use_ORDER_COL32_2R_4R4_
)
{
// if (use_ORDER_COL32_2R_4R4_) {
order_matrixB
=
CUBLASLT_ORDER_COL32_2R_4R4
;
// order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
// }
else
{
// else {
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// }
// #else
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
}
// #endif
#else
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
#endif
int
ldaTransform
=
32
*
m
;
int
ldaTransform
=
32
*
m
;
int
ldbTransform
;
int
ldbTransform
;
...
@@ -128,11 +128,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
...
@@ -128,11 +128,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
int
ldcTransform
=
32
*
m
;
int
ldcTransform
=
32
*
m
;
// create matmulDesc
// create matmulDesc
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
,
CUDA_R_32I
);
//
cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
#else
//
#else
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
);
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
);
#endif
//
#endif
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
opTranspose
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
opTranspose
,
sizeof
(
cublasOperation_t
));
cublasLtMatrixLayoutCreate
(
&
AtransformDesc
,
CUDA_R_8I
,
m
,
k
,
ldaTransform
);
cublasLtMatrixLayoutCreate
(
&
AtransformDesc
,
CUDA_R_8I
,
m
,
k
,
ldaTransform
);
cublasLtMatrixLayoutSetAttribute
(
AtransformDesc
,
CUBLASLT_MATRIX_LAYOUT_ORDER
,
&
order_COL32
,
sizeof
(
order_COL32
));
cublasLtMatrixLayoutSetAttribute
(
AtransformDesc
,
CUBLASLT_MATRIX_LAYOUT_ORDER
,
&
order_COL32
,
sizeof
(
order_COL32
));
...
@@ -187,10 +187,10 @@ void cublasINT8MMWrapper::Gemm(int* res,
...
@@ -187,10 +187,10 @@ void cublasINT8MMWrapper::Gemm(int* res,
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
tmp_info
.
swizzle
),
sizeof
(
tmp_info
.
swizzle
));
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
tmp_info
.
swizzle
),
sizeof
(
tmp_info
.
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
tmp_info
.
reductionScheme
),
sizeof
(
int
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
tmp_info
.
reductionScheme
),
sizeof
(
int
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
//
cublasLtMatmulAlgoConfigSetAttribute(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
tmp_info
.
stages
),
sizeof
(
tmp_info
.
stages
));
//
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
//
#endif
}
}
else
{
else
{
findAlgo
=
1
;
findAlgo
=
1
;
...
@@ -215,16 +215,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
...
@@ -215,16 +215,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
swizzle
),
sizeof
(
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
swizzle
),
sizeof
(
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
reductionScheme
),
sizeof
(
int
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
reductionScheme
),
sizeof
(
int
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
int
stages
;
//
int stages;
if
(
use_ORDER_COL32_2R_4R4_
)
{
//
if (use_ORDER_COL32_2R_4R4_) {
stages
=
15
;
//
stages = 15;
}
//
}
else
{
//
else {
stages
=
13
;
//
stages = 13;
}
//
}
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
stages
),
sizeof
(
stages
));
//
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
//
#endif
}
}
cublasLtMatmul
(
cublaslt_handle_
,
cublasLtMatmul
(
cublaslt_handle_
,
...
@@ -273,11 +273,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
...
@@ -273,11 +273,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
// int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
// int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
// cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
// cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
cudaDataType_t
scaleType
=
CUDA_R_32F
;
cudaDataType_t
scaleType
=
CUDA_R_32F
;
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasComputeType_t
computeType
=
CUBLAS_COMPUTE_32I
;
//
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
//
#else
cudaDataType_t
computeType
=
CUDA_R_32I
;
cudaDataType_t
computeType
=
CUDA_R_32I
;
#endif
//
#endif
cublasLtMatmulDesc_t
matmulDesc
;
cublasLtMatmulDesc_t
matmulDesc
;
cublasLtMatrixLayout_t
AtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
AtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
BtransformDesc
=
NULL
;
cublasLtMatrixLayout_t
BtransformDesc
=
NULL
;
...
@@ -285,16 +285,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
...
@@ -285,16 +285,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
cublasLtOrder_t
order_COL32
=
CUBLASLT_ORDER_COL32
;
cublasLtOrder_t
order_COL32
=
CUBLASLT_ORDER_COL32
;
cublasLtOrder_t
order_matrixB
;
cublasLtOrder_t
order_matrixB
;
#if (CUDART_VERSION >= 11000)
// #if (CUDART_VERSION >= 11000)
if
(
use_ORDER_COL32_2R_4R4_
)
{
// if (use_ORDER_COL32_2R_4R4_) {
order_matrixB
=
CUBLASLT_ORDER_COL32_2R_4R4
;
// order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
// }
else
{
// else {
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// }
// #else
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
}
// #endif
#else
order_matrixB
=
CUBLASLT_ORDER_COL4_4R2_8C
;
#endif
int
ldaTransform
=
32
*
m
;
int
ldaTransform
=
32
*
m
;
...
@@ -309,11 +309,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
...
@@ -309,11 +309,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
int
ldcTransform
=
32
*
m
;
int
ldcTransform
=
32
*
m
;
// create matmulDesc
// create matmulDesc
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
,
scaleType
);
//
cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
#else
//
#else
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
);
cublasLtMatmulDescCreate
(
&
matmulDesc
,
computeType
);
#endif
//
#endif
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
opTranspose
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
opTranspose
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_SCALE_TYPE
,
&
scaleType
,
sizeof
(
scaleType
));
cublasLtMatmulDescSetAttribute
(
matmulDesc
,
CUBLASLT_MATMUL_DESC_SCALE_TYPE
,
&
scaleType
,
sizeof
(
scaleType
));
// cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
// cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
...
@@ -367,10 +367,10 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
...
@@ -367,10 +367,10 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
tmp_info
.
swizzle
),
sizeof
(
tmp_info
.
swizzle
));
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
tmp_info
.
swizzle
),
sizeof
(
tmp_info
.
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
tmp_info
.
reductionScheme
),
sizeof
(
int
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
tmp_info
.
reductionScheme
),
sizeof
(
int
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
//
cublasLtMatmulAlgoConfigSetAttribute(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
tmp_info
.
stages
),
sizeof
(
tmp_info
.
stages
));
//
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
//
#endif
}
}
else
{
else
{
findAlgo
=
1
;
findAlgo
=
1
;
...
@@ -395,16 +395,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
...
@@ -395,16 +395,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
swizzle
),
sizeof
(
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
swizzle
),
sizeof
(
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
reductionScheme
),
sizeof
(
int
));
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
reductionScheme
),
sizeof
(
int
));
#if (CUDART_VERSION >= 11000)
//
#if (CUDART_VERSION >= 11000)
int
stages
;
//
int stages;
if
(
use_ORDER_COL32_2R_4R4_
)
{
//
if (use_ORDER_COL32_2R_4R4_) {
stages
=
15
;
//
stages = 15;
}
//
}
else
{
//
else {
stages
=
13
;
//
stages = 13;
}
//
}
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
stages
),
sizeof
(
stages
));
//
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
//
#endif
}
}
float
beta
=
0.0
f
;
float
beta
=
0.0
f
;
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment