sglang · Commit 3ee62235 (Unverified)

revert the MoE dependence (#3230)

Authored by Yineng Zhang, Jan 31, 2025 · Committed by GitHub, Jan 31, 2025
Parent: 9829e77e
Changes: 94
Showing 20 changed files with 0 additions and 4657 deletions (+0 -4657):

  sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp              +0  -34
  sgl-kernel/3rdparty/tensorrt_llm/common/assert.h                +0  -92
  sgl-kernel/3rdparty/tensorrt_llm/common/cublasMMWrapper.cpp     +0  -360
  sgl-kernel/3rdparty/tensorrt_llm/common/cublasMMWrapper.h       +0  -148
  sgl-kernel/3rdparty/tensorrt_llm/common/cublasVersionCheck.h    +0  -35
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaBf16Fallbacks.cuh   +0  -313
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaBf16Wrapper.h       +0  -21
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp   +0  -187
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h     +0  -138
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.cu         +0  -436
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h          +0  -239
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaTypeUtils.cuh       +0  -752
  sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h             +0  -641
  sgl-kernel/3rdparty/tensorrt_llm/common/logger.cpp              +0  -70
  sgl-kernel/3rdparty/tensorrt_llm/common/logger.h                +0  -190
  sgl-kernel/3rdparty/tensorrt_llm/common/quantTypeUtils.cuh      +0  -55
  sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h          +0  -358
  sgl-kernel/3rdparty/tensorrt_llm/common/reduceKernelUtils.cuh   +0  -399
  sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.cpp         +0  -76
  sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h           +0  -113
sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp · deleted (100644 → 0)
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/assert.h"

namespace
{

bool initCheckDebug()
{
    auto constexpr kDebugEnabled = "TLLM_DEBUG_MODE";
    auto const debugEnabled = std::getenv(kDebugEnabled);
    return debugEnabled && debugEnabled[0] == '1';
}
} // namespace

bool DebugConfig::isCheckDebugEnabled()
{
    static bool const debugEnabled = initCheckDebug();
    return debugEnabled;
}
sgl-kernel/3rdparty/tensorrt_llm/common/assert.h · deleted (100644 → 0)
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
#include <string>
namespace tensorrt_llm::common
{
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
{
    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()));
}

} // namespace tensorrt_llm::common

class DebugConfig
{
public:
    static bool isCheckDebugEnabled();
};

#if defined(_WIN32)
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
#else
#define TLLM_LIKELY(x) __builtin_expect((x), 1)
#define TLLM_UNLIKELY(x) __builtin_expect((x), 0)
#endif
#define TLLM_CHECK(val) \
do \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} while (0)
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
do \
{ \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError( \
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \
} while (0)
#define TLLM_CHECK_DEBUG(val) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} \
} while (0)
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError( \
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \
} \
} while (0)
#define TLLM_THROW(...) \
do \
{ \
throw NEW_TLLM_EXCEPTION(__VA_ARGS__); \
} while (0)
#define TLLM_WRAP(ex) \
NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what())
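
For context, a minimal usage sketch of the check macros defined above follows; it is not part of the diff, and the function name and shape arguments are illustrative only.

// Hedged sketch: typical caller-side use of TLLM_CHECK / TLLM_CHECK_WITH_INFO.
// launchExampleGemm and its arguments are made up for illustration.
#include "tensorrt_llm/common/assert.h"

void launchExampleGemm(int m, int n, int k, void const* A, void const* B, void* C)
{
    TLLM_CHECK(A != nullptr && B != nullptr && C != nullptr);
    TLLM_CHECK_WITH_INFO(m > 0 && n > 0 && k > 0, "invalid GEMM shape (m=%d, n=%d, k=%d)", m, n, k);
    // ... launch the kernel ...
}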
sgl-kernel/3rdparty/tensorrt_llm/common/cublasMMWrapper.cpp · deleted (100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cublasVersionCheck.h"
#include <algorithm>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif

namespace tensorrt_llm
{
namespace common
{

CublasMMWrapper::CublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle,
    std::shared_ptr<cublasLtHandle_t> cublasltHandle, cudaStream_t stream, void* workspace)
    : mCublasHandle(cublasHandle)
    , mCublasLtHandle(cublasltHandle)
    , mStream(stream)
    , mCublasWorkspace(workspace)
{
}

CublasMMWrapper::~CublasMMWrapper() {}

CublasMMWrapper::CublasMMWrapper(CublasMMWrapper const& wrapper)
    : mCublasHandle(wrapper.mCublasHandle)
    , mCublasLtHandle(wrapper.mCublasLtHandle)
    , mStream(wrapper.mStream)
{
}

void CublasMMWrapper::createDescriptors(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n,
    int const k, int const lda, int const ldb, int const ldc, int8_t fastAcc)
{
    // --------------------------------------
    // Create descriptors for the original matrices
    check_cuda_error(cublasLtMatrixLayoutCreate(
        &mADesc, mAType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
    check_cuda_error(cublasLtMatrixLayoutCreate(
        &mBDesc, mBType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb));
    check_cuda_error(cublasLtMatrixLayoutCreate(&mCDesc, mCType, m, n, ldc));
    check_cuda_error(cublasLtMatmulDescCreate(&mOperationDesc, mComputeType, mScaleType));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        mOperationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        mOperationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        mOperationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAcc, sizeof(int8_t)));
}

void CublasMMWrapper::setScaleDescriptors(void* scale_a, void* scale_b)
{
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        mOperationDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &scale_a, sizeof(void*)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        mOperationDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &scale_b, sizeof(void*)));
}

void CublasMMWrapper::destroyDescriptors()
{
    check_cuda_error(cublasLtMatmulDescDestroy(mOperationDesc));
    check_cuda_error(cublasLtMatrixLayoutDestroy(mADesc));
    check_cuda_error(cublasLtMatrixLayoutDestroy(mBDesc));
    check_cuda_error(cublasLtMatrixLayoutDestroy(mCDesc));
    mOperationDesc = NULL;
    mADesc = NULL;
    mBDesc = NULL;
    mCDesc = NULL;
}

void CublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
    void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc)
{
    Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
}

void CublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
    void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc,
    std::optional<cublasLtMatmulHeuristicResult_t> const& heuristic)
{
    if (heuristic)
    {
        Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f,
            /* hasAlgo */ (*heuristic).algo,
            (*heuristic).state == CUBLAS_STATUS_SUCCESS && (*heuristic).workspaceSize < CUBLAS_WORKSPACE_SIZE,
            /* usingCublasLt */ true);
    }
    else
    {
        Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f, {}, /* hasAlgo */ false,
            /* usingCublasLt */ true);
    }
}

void CublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
    void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha, float f_beta,
    std::optional<cublasLtMatmulHeuristicResult_t> const& heuristic)
{
    if (heuristic)
    {
        Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, f_alpha, f_beta,
            /* hasAlgo */ (*heuristic).algo,
            (*heuristic).state == CUBLAS_STATUS_SUCCESS && (*heuristic).workspaceSize < CUBLAS_WORKSPACE_SIZE,
            /* usingCublasLt */ true);
    }
    else
    {
        Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, f_alpha, f_beta, {}, /* hasAlgo */ false,
            /* usingCublasLt */ true);
    }
}

void CublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
    void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha, float f_beta)
{
    bool usingCublasLt = mAType == CUDA_R_16F || mAType == CUDA_R_8F_E4M3;

    Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, f_alpha, f_beta, {}, /* hasAlgo */ false,
        /* usingCublasLt */ usingCublasLt);
}

void CublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
    void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha, float f_beta,
    cublasLtMatmulAlgo_t const& algo, bool hasAlgo, bool usingCublasLt)
{
    half h_alpha = (half) (f_alpha);
    half h_beta = (half) (f_beta);

    // TODO: default cublas libs
    usingCublasLt = usingCublasLt && (mAType == CUDA_R_16F || mAType == CUDA_R_8F_E4M3);
    bool isFp16ComputeType = mComputeType == CUBLAS_COMPUTE_16F;
    int batch_count = 1;
    // fp32 use cublas as default
    // fp16 use cublasLt as default
    void const* alpha = isFp16ComputeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
    void const* beta = isFp16ComputeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
    int workspaceSize = mCublasWorkspace == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;

    if (usingCublasLt)
    {
        if (hasAlgo)
        {
            hasAlgo = checkTactic(transa, transb, m, n, k, lda, ldb, ldc, algo);
        }

        check_cuda_error(cublasLtMatmul(getCublasLtHandle(), mOperationDesc, alpha, A, mADesc, B, mBDesc, beta, C,
            mCDesc, C, mCDesc, (hasAlgo ? (&algo) : NULL), mCublasWorkspace, workspaceSize, mStream));

        sync_check_cuda_error();
    }
    else
    {
        check_cuda_error(cublasSetStream(getCublasHandle(), mStream));
        check_cuda_error(cublasSetWorkspace(getCublasHandle(), mCublasWorkspace, workspaceSize));
        // Go with default heuristic to choose tactic as cuBLAS does not allow to choose tactics in Ampere+
        cublasGemmAlgo_t cublasAlgo = CUBLAS_GEMM_DEFAULT;
        check_cuda_error(cublasGemmEx(getCublasHandle(), transa, transb, m, n, k, alpha, A, mAType, lda, B, mBType,
            ldb, beta, C, mCType, ldc, mComputeType, static_cast<cublasGemmAlgo_t>(cublasAlgo)));
        sync_check_cuda_error();
    }
}

void CublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n,
    int const k, void const* A, int const lda, const int64_t strideA, void const* B, int const ldb,
    const int64_t strideB, void* C, int const ldc, const int64_t strideC, int const batchCount, float const f_alpha,
    float const f_beta)
{
    half h_alpha = (half) f_alpha;
    half h_beta = (half) f_beta;

    int isFp16ComputeType = mComputeType == CUBLAS_COMPUTE_16F ? 1 : 0;
    void const* alpha
        = isFp16ComputeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void const*>(&f_alpha);
    void const* beta = isFp16ComputeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void const*>(&f_beta);

    check_cuda_error(cublasGemmStridedBatchedEx(getCublasHandle(), transa, transb, m, n, k, alpha, A, mAType, lda,
        strideA, B, mBType, ldb, strideB, beta, C, mCType, ldc, strideC, batchCount, mComputeType,
        mAType == CUDA_R_32F ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

void CublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n,
    int const k, float const f_alpha, void const* A, cudaDataType_t AType, int const lda, const int64_t strideA,
    void const* B, cudaDataType_t BType, int const ldb, const int64_t strideB, float const f_beta, void* C,
    cudaDataType_t CType, int const ldc, const int64_t strideC, int const batchCount, cudaDataType_t computeType)
{
    half h_alpha = (half) f_alpha;
    half h_beta = (half) f_beta;

    bool isFp16ComputeType = mComputeType == CUBLAS_COMPUTE_16F ? 1 : 0;
    void const* alpha
        = isFp16ComputeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void const*>(&f_alpha);
    void const* beta = isFp16ComputeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void const*>(&f_beta);

    check_cuda_error(cublasGemmStridedBatchedEx(getCublasHandle(), transa, transb, m, n, k, alpha, A, AType, lda,
        strideA, B, BType, ldb, strideB, beta, C, CType, ldc, strideC, batchCount, computeType,
        mAType == CUDA_R_32F ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

void CublasMMWrapper::setWorkspace(void* workspace)
{
    mCublasWorkspace = workspace;
}

void CublasMMWrapper::setFP32GemmConfig()
{
    setGemmConfig(CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F);
}

void CublasMMWrapper::setFP16GemmConfig(cudaDataType_t outputType)
{
    setGemmConfig(CUDA_R_16F, CUDA_R_16F, outputType, CUDA_R_32F);
}

#ifdef ENABLE_BF16
void CublasMMWrapper::setBF16GemmConfig(cudaDataType_t outputType)
{
    setGemmConfig(CUDA_R_16BF, CUDA_R_16BF, outputType, CUDA_R_32F);
}
#endif

#ifdef ENABLE_FP8
void CublasMMWrapper::setFP8GemmConfig(cudaDataType_t outputType)
{
    setGemmConfig(CUDA_R_8F_E4M3, CUDA_R_8F_E4M3, outputType, CUDA_R_32F);
}
#endif

void CublasMMWrapper::setGemmConfig(
    cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType)
{
    mAType = aType;
    mBType = bType;
    mCType = cType;
    bool isFp16ComputeType = computeType == CUDA_R_16F;
    if (isFp16ComputeType)
    {
        mComputeType = CUBLAS_COMPUTE_16F;
        mScaleType = CUDA_R_16F;
    }
    else
    {
        mComputeType = CUBLAS_COMPUTE_32F;
        mScaleType = CUDA_R_32F;
    }
}

CublasDataType CublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
{
    if (data_type == CUDA_R_16F)
    {
        return HALF_DATATYPE;
    }
    else if (data_type == CUDA_R_32F)
    {
        return FLOAT_DATATYPE;
    }
    else if (data_type == CUDA_R_8I)
    {
        return INT8_DATATYPE;
    }
#ifdef ENABLE_BF16
    else if (data_type == CUDA_R_16BF)
    {
        return BFLOAT16_DATATYPE;
    }
#endif
    return FLOAT_DATATYPE;
}

void CublasMMWrapper::setStream(cudaStream_t stream)
{
    mStream = stream;
}

bool CublasMMWrapper::checkTactic(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n,
    int const k, int const lda, int const ldb, int const ldc, cublasLtMatmulAlgo_t const& algo)
{
    TLLM_CHECK_WITH_INFO(
        descriptorsCreated(), "Descriptors are not created! Call createDescriptors before calling this function");

    int workspaceSize = mCublasWorkspace == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;

    cublasLtMatmulHeuristicResult_t heurResult;
    cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
        getCublasLtHandle(), mOperationDesc, mADesc, mBDesc, mCDesc, mCDesc, &algo, &heurResult);

    if (algoStatus != CUBLAS_STATUS_SUCCESS || heurResult.state != CUBLAS_STATUS_SUCCESS
        || heurResult.workspaceSize > CUBLAS_WORKSPACE_SIZE)
    {
        return false;
    }

    sync_check_cuda_error();

    return true;
}

std::vector<cublasLtMatmulHeuristicResult_t> CublasMMWrapper::getTactics(cublasOperation_t transa,
    cublasOperation_t transb, int const m, int const n, int const k, int const lda, int const ldb, int const ldc)
{
    TLLM_CHECK_WITH_INFO(
        descriptorsCreated(), "Descriptors are not created! Call createDescriptors before calling this function");

    auto const heuristics = getTactics(getCublasLtHandle(), mOperationDesc, mADesc, mBDesc, mCDesc, mCDesc);

    sync_check_cuda_error();

    return heuristics;
}

std::vector<cublasLtMatmulHeuristicResult_t> CublasMMWrapper::getTactics(cublasLtHandle_t lightHandle,
    cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc,
    cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc)
{
#if TLLM_CUBLAS_VER_LE(11, 4, 2)
    TLLM_CHECK_WITH_INFO(false, "CUBLAS version too low, must be > 11.4.2.");
    return {};
#else
    std::vector<cublasLtMatmulHeuristicResult_t> heuristics(200);
    cublasLtMatmulPreference_t preference;
    check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
    check_cuda_error(cublasLtMatmulPreferenceInit(preference));
    uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE;
    check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
        preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size)));
    // Restrict reduction algorithms for numerical stability and better determinism
    uint32_t reduction_mask = CUBLASLT_REDUCTION_SCHEME_MASK;
    check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
        preference, CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, &reduction_mask, sizeof(reduction_mask)));
#if TLLM_CUBLAS_VER_LT(12, 0, 0)
    uint32_t pointer_mode_mask = 0;
    check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
        preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
#endif

    int return_count = 0;
    check_cuda_error(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc, preference,
        heuristics.size(), heuristics.data(), &return_count));
    heuristics.resize(return_count);

    return heuristics;
#endif
}

} // namespace common
} // namespace tensorrt_llm
sgl-kernel/3rdparty/tensorrt_llm/common/cublasMMWrapper.h · deleted (100644 → 0)
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/cudaUtils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <optional>
#include <string>

namespace tensorrt_llm
{
namespace common
{

class CublasMMWrapper
{
protected:
    std::shared_ptr<cublasHandle_t> mCublasHandle;
    std::shared_ptr<cublasLtHandle_t> mCublasLtHandle;

    cudaDataType_t mAType{};
    cudaDataType_t mBType{};
    cudaDataType_t mCType{};
    cublasComputeType_t mComputeType{};
    cudaDataType_t mScaleType{};

    cublasLtMatmulDesc_t mOperationDesc{NULL};
    cublasLtMatrixLayout_t mADesc{NULL};
    cublasLtMatrixLayout_t mBDesc{NULL};
    cublasLtMatrixLayout_t mCDesc{NULL};

    cudaStream_t mStream;

    void* mCublasWorkspace = nullptr;

private:
    bool descriptorsCreated() const
    {
        return mOperationDesc != NULL && mADesc != NULL && mBDesc != NULL && mCDesc != NULL;
    }

public:
    CublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle, std::shared_ptr<cublasLtHandle_t> cublasLtHandle,
        cudaStream_t stream, void* workspace);

    ~CublasMMWrapper();

    CublasMMWrapper(CublasMMWrapper const& wrapper);

    /********************** GEMMs **********************/
    void Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc);

    void Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc,
        std::optional<cublasLtMatmulHeuristicResult_t> const& algo);

    void Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha,
        float f_beta, std::optional<cublasLtMatmulHeuristicResult_t> const& algo);

    void Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha,
        float f_beta);

    void Gemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, void const* B, int const ldb, void* C, int const ldc, float f_alpha,
        float f_beta, cublasLtMatmulAlgo_t const& algo, bool hasAlgo, bool usingCublasLt);

    void stridedBatchedGemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        void const* A, int const lda, const int64_t strideA, void const* B, int const ldb, const int64_t strideB,
        void* C, int const ldc, const int64_t strideC, int const batchCount, float const f_alpha = 1.0f,
        float const f_beta = 0.0f);

    void stridedBatchedGemm(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        float const f_alpha, void const* A, cudaDataType_t AType, int const lda, const int64_t strideA,
        void const* B, cudaDataType_t BType, int const ldb, const int64_t strideB, float const f_beta, void* C,
        cudaDataType_t CType, int const ldc, const int64_t strideC, int const batchCount, cudaDataType_t computeType);

    /********************** Tactic selection helpers **********************/
    bool checkTactic(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        int const lda, int const ldb, int const ldc, cublasLtMatmulAlgo_t const& algo);

    std::vector<cublasLtMatmulHeuristicResult_t> getTactics(cublasOperation_t transa, cublasOperation_t transb,
        int const m, int const n, int const k, int const lda, int const ldb, int const ldc);

    std::vector<cublasLtMatmulHeuristicResult_t> getTactics(cublasLtHandle_t lightHandle,
        cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc,
        cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc);

    using MatrixLayout = std::tuple<cudaDataType_t, cublasLtOrder_t, uint64_t, uint64_t>;
    using cache_idx_t = std::tuple<cublasLtMatmulDesc_t, std::array<MatrixLayout, 4>>;

    MatrixLayout createMatrixLayout(cublasLtMatrixLayout_t Mdesc);

    /********************** Utils **********************/
    void setWorkspace(void* workspace);

    void setFP32GemmConfig();
    void setFP16GemmConfig(cudaDataType_t outputType = CUDA_R_16F);
#ifdef ENABLE_BF16
    void setBF16GemmConfig(cudaDataType_t outputType = CUDA_R_16BF);
#endif
#ifdef ENABLE_FP8
    void setFP8GemmConfig(cudaDataType_t outputType = CUDA_R_16F);
#endif

    void setStream(cudaStream_t stream);

    void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType);

    CublasDataType getCublasDataType(cudaDataType_t data_type);

    void createDescriptors(cublasOperation_t transa, cublasOperation_t transb, int const m, int const n, int const k,
        int const lda, int const ldb, int const ldc, int8_t fastAcc = 0);
    void setScaleDescriptors(void* scale_a, void* scale_b);
    void destroyDescriptors();

    cublasHandle_t getCublasHandle()
    {
        return *(this->mCublasHandle);
    }

    cublasLtHandle_t getCublasLtHandle() const
    {
        return *(this->mCublasLtHandle);
    }
};

} // namespace common
} // namespace tensorrt_llm
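
A hedged sketch of how this wrapper was typically driven (handle creation, dtype configuration, descriptor setup, then the GEMM) follows; it is not taken from the deleted sources, the leading dimensions assume column-major non-transposed operands, and error handling plus workspace sizing are elided.

// Illustrative-only sketch of using CublasMMWrapper; names and lifetimes are simplified.
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include <memory>

void runFp16GemmExample(
    int m, int n, int k, void const* A, void const* B, void* C, cudaStream_t stream, void* workspace)
{
    auto cublasHandle = std::make_shared<cublasHandle_t>();
    auto cublasLtHandle = std::make_shared<cublasLtHandle_t>();
    cublasCreate(cublasHandle.get());
    cublasLtCreate(cublasLtHandle.get());

    tensorrt_llm::common::CublasMMWrapper wrapper(cublasHandle, cublasLtHandle, stream, workspace);
    wrapper.setFP16GemmConfig();                                     // FP16 in/out, FP32 accumulation
    wrapper.createDescriptors(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, m, k, m);
    wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, A, m, B, k, C, m);
    wrapper.destroyDescriptors();

    cublasLtDestroy(*cublasLtHandle);
    cublasDestroy(*cublasHandle);
}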
sgl-kernel/3rdparty/tensorrt_llm/common/cublasVersionCheck.h · deleted (100644 → 0)
/*
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// We don't want to include cublas_api.h. It contains the CUBLAS_VER_* macro
// definition which is not sufficient to determine if we include cublas.h,
// cublas_v2.h or cublasLt.h.
#define TLLM_CUBLAS_VERSION_CALC(MAJOR, MINOR, PATCH) (MAJOR * 10000 + MINOR * 100 + PATCH)
#define TLLM_CUBLAS_VER_LE(MAJOR, MINOR, PATCH) \
TLLM_CUBLAS_VERSION_CALC(CUBLAS_VER_MAJOR, CUBLAS_VER_MINOR, CUBLAS_VER_PATCH) \
<= TLLM_CUBLAS_VERSION_CALC(MAJOR, MINOR, PATCH)
#define TLLM_CUBLAS_VER_LT(MAJOR, MINOR, PATCH) \
TLLM_CUBLAS_VERSION_CALC(CUBLAS_VER_MAJOR, CUBLAS_VER_MINOR, CUBLAS_VER_PATCH) \
< TLLM_CUBLAS_VERSION_CALC(MAJOR, MINOR, PATCH)
#define TLLM_CUBLAS_VER_GE(MAJOR, MINOR, PATCH) \
TLLM_CUBLAS_VERSION_CALC(CUBLAS_VER_MAJOR, CUBLAS_VER_MINOR, CUBLAS_VER_PATCH) \
>= TLLM_CUBLAS_VERSION_CALC(MAJOR, MINOR, PATCH)
#define TLLM_CUBLAS_VER_GT(MAJOR, MINOR, PATCH) \
TLLM_CUBLAS_VERSION_CALC(CUBLAS_VER_MAJOR, CUBLAS_VER_MINOR, CUBLAS_VER_PATCH) \
> TLLM_CUBLAS_VERSION_CALC(MAJOR, MINOR, PATCH)
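
A small, assumed example of how these guards gate version-specific cuBLAS code paths at compile time follows; the feature macro name is made up for illustration.

// Hypothetical usage of the version-check macros; CUBLAS_VER_* come from the cuBLAS headers.
#include <cublas_v2.h>
#include "tensorrt_llm/common/cublasVersionCheck.h"

#if TLLM_CUBLAS_VER_GE(11, 4, 3)
// Heuristic enumeration through cublasLt is usable on this toolchain.
#define EXAMPLE_HAS_LT_HEURISTICS 1
#else
#define EXAMPLE_HAS_LT_HEURISTICS 0
#endif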
sgl-kernel/3rdparty/tensorrt_llm/common/cudaBf16Fallbacks.cuh · deleted (100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
namespace tensorrt_llm
{
namespace common
{

#ifdef ENABLE_BF16
inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = __low2float(val);
    f_val.y = __high2float(val);
    return f_val;
#else
    return __bfloat1622float2(val);
#endif
}

inline __device__ int16_t bf1622int16(__nv_bfloat162 val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = max(min(__low2float(val), 127.f), -128.f);
    f_val.y = max(min(__high2float(val), 127.f), -128.f);

    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
    return int16;
#else
    val = __hmin2(val, make_bfloat162(127., 127.));
    val = __hmax2(val, make_bfloat162(-128., -128.));

    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
    return int16;
#endif
}

inline __device__ __nv_bfloat162 float22bf162(const float2 val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __floats2bfloat162_rn(val.x, val.y);
#else
    return __float22bfloat162_rn(val);
#endif
}

inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    __nv_bfloat162 val2;
    val2.x = val;
    val2.y = val;
    return val2;
#else
    return __bfloat162bfloat162(val);
#endif
}

inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
    fxh = __high2float(x);
    fyl = __low2float(y);
    fyh = __high2float(y);
    return __floats2bfloat162_rn(fxl + fyl, fxh + fyh);
#else
    return __hadd2(x, y);
#endif
}

inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(x) + __bfloat162float(y));
#else
    return __hadd(x, y);
#endif
}

inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
    fxh = __high2float(x);
    fyl = __low2float(y);
    fyh = __high2float(y);
    return __floats2bfloat162_rn(fxl - fyl, fxh - fyh);
#else
    return __hsub2(x, y);
#endif
}

inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(x) - __bfloat162float(y));
#else
    return __hsub(x, y);
#endif
}

inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
    fxh = __high2float(x);
    fyl = __low2float(y);
    fyh = __high2float(y);
    return __floats2bfloat162_rn(fxl * fyl, fxh * fyh);
#else
    return __hmul2(x, y);
#endif
}

inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y));
#else
    return __hmul(x, y);
#endif
}

inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh, fzl, fzh;
    fxl = __low2float(x);
    fxh = __high2float(x);
    fyl = __low2float(y);
    fyh = __high2float(y);
    fzl = __low2float(z);
    fzh = __high2float(z);
    return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh);
#else
    return __hfma2(x, y, z);
#endif
}

inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
    return __hfma(x, y, z);
#endif
}

inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh;
    fxl = __low2float(x);
    fxh = __high2float(x);
    return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
    return h2exp(x);
#endif
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
    __nv_bfloat162 t;
    t.x = x;
    t.y = y;
    return t;
}
#endif
#endif

inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
    return a + b + c;
#endif
}

inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
    return (__nv_bfloat16) ((float) a + (float) b + (float) c + (float) d);
#endif
}

inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
    fah = __high2float(a);
    fbl = __low2float(b);
    fbh = __high2float(b);
    fcl = __low2float(c);
    fch = __high2float(c);
    return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch);
#else
    return a + b + c;
#endif
}

inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
    return a * b * c;
#endif
}

inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
    fah = __high2float(a);
    fbl = __low2float(b);
    fbh = __high2float(b);
    fcl = __low2float(c);
    fch = __high2float(c);
    return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch);
#else
    return a * b * c;
#endif
}

inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
    fal = __low2float(a);
    fah = __high2float(a);
    fbl = __low2float(b);
    fbh = __high2float(b);
    fcl = __low2float(c);
    fch = __high2float(c);
    fdl = __low2float(d);
    fdh = __high2float(d);
    return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh);
#else
    return a * b * c + d;
#endif
}
#endif // ENABLE_BF16

} // namespace common
} // namespace tensorrt_llm

// Operator definitions intentionally in global namespace
namespace
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020)
inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y)
{
    return tensorrt_llm::common::bf16hmul2(x, y);
};

inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y)
{
    return tensorrt_llm::common::bf16hadd2(x, y);
};
#endif
#endif
} // namespace
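
A minimal CUDA sketch (assuming ENABLE_BF16 is defined) of how these fallbacks let one kernel body compile for both pre- and post-Ampere architectures follows; the kernel name and shapes are illustrative, not part of the deleted sources.

// Illustrative kernel only: bf16hfma2 resolves to __hfma2 on sm_80+ and to the
// float-based fallback on older architectures, so the kernel body stays identical.
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"

#ifdef ENABLE_BF16
__global__ void exampleAxpy2(__nv_bfloat162 const* x, __nv_bfloat162* y, __nv_bfloat162 alpha2, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = tensorrt_llm::common::bf16hfma2(alpha2, x[i], y[i]); // y = alpha * x + y, elementwise on pairs
    }
}
#endif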
sgl-kernel/3rdparty/tensorrt_llm/common/cudaBf16Wrapper.h · deleted (100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp · deleted (100644 → 0)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define CUDA_LIB_NAME "cuda"
#if defined(_WIN32)
#include <windows.h>
#define dllOpen(name) LoadLibrary("nv" name ".dll")
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
#else // For non-Windows platforms
#include <dlfcn.h>
#define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY)
#define dllClose(handle) dlclose(handle)
#define dllGetSym(handle, name) dlsym(handle, name)
#endif // defined(_WIN32)
#include "cudaDriverWrapper.h"
#include "tensorrt_llm/common/assert.h"
#include <cstdio>
#include <cuda.h>

namespace tensorrt_llm::common
{

std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
{
    static std::mutex mutex;
    static std::weak_ptr<CUDADriverWrapper> instance;
    std::shared_ptr<CUDADriverWrapper> result = instance.lock();
    if (result)
    {
        return result;
    }

    std::lock_guard<std::mutex> lock(mutex);
    result = instance.lock();
    if (!result)
    {
        result = std::shared_ptr<CUDADriverWrapper>(new CUDADriverWrapper());
        instance = result;
    }
    return result;
}

CUDADriverWrapper::CUDADriverWrapper()
    : handle(dllOpen(CUDA_LIB_NAME))
{
    TLLM_CHECK_WITH_INFO(handle != nullptr, "CUDA driver library is not open correctly.");

    auto load_sym = [](void* handle, char const* name)
    {
        void* ret = dllGetSym(handle, name);
        return ret;
    };

    *reinterpret_cast<void**>(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
    *reinterpret_cast<void**>(&_cuGetErrorMessage) = load_sym(handle, "cuGetErrorMessage");
    *reinterpret_cast<void**>(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
    *reinterpret_cast<void**>(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
    *reinterpret_cast<void**>(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
    *reinterpret_cast<void**>(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
    *reinterpret_cast<void**>(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
    *reinterpret_cast<void**>(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
    *reinterpret_cast<void**>(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
    *reinterpret_cast<void**>(&_cuModuleGetGlobal) = load_sym(handle, "cuModuleGetGlobal_v2");
    *reinterpret_cast<void**>(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
    *reinterpret_cast<void**>(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
    *reinterpret_cast<void**>(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
    *reinterpret_cast<void**>(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
    *reinterpret_cast<void**>(&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled");
    *reinterpret_cast<void**>(&_cuMemcpyDtoH) = load_sym(handle, "cuMemcpyDtoH_v2");
}

CUDADriverWrapper::~CUDADriverWrapper()
{
    dllClose(handle);
}

CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, char const** pStr) const
{
    return (*_cuGetErrorName)(error, pStr);
}

CUresult CUDADriverWrapper::cuGetErrorMessage(CUresult error, char const** pStr) const
{
    return (*_cuGetErrorMessage)(error, pStr);
}

CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const
{
    return (*_cuFuncSetAttribute)(hfunc, attrib, value);
}

CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const
{
    return (*_cuLinkComplete)(state, cubinOut, sizeOut);
}

CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
{
    return (*_cuModuleUnload)(hmod);
}

CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
{
    return (*_cuLinkDestroy)(state);
}

CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, void const* image) const
{
    return (*_cuModuleLoadData)(module, image);
}

CUresult CUDADriverWrapper::cuLinkCreate(
    unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const
{
    return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut);
}

CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const
{
    return (*_cuModuleGetFunction)(hfunc, hmod, name);
}

CUresult CUDADriverWrapper::cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const
{
    return (*_cuModuleGetGlobal)(dptr, bytes, hmod, name);
}

CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path,
    unsigned int numOptions, CUjit_option* options, void** optionValues) const
{
    return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues);
}

CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size,
    char const* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const
{
    return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues);
}

CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const
{
    return (*_cuLaunchCooperativeKernel)(
        f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams);
}

CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) const
{
    return (*_cuLaunchKernel)(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
        hStream, kernelParams, extra);
}

CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
    cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const
{
    return (*_cuTensorMapEncodeTiled)(tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, globalStrides,
        boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill);
}

CUresult CUDADriverWrapper::cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const
{
    return (*_cuMemcpyDtoH)(dstHost, srcDevice, ByteCount);
}

} // namespace tensorrt_llm::common
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h · deleted (100644 → 0)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CUDA_DRIVER_WRAPPER_H
#define CUDA_DRIVER_WRAPPER_H
#include "tensorrt_llm/common/assert.h"
#include <cstdio>
#include <cuda.h>
#include <memory>
#include <mutex>

namespace tensorrt_llm::common
{

class CUDADriverWrapper
{
public:
    static std::shared_ptr<CUDADriverWrapper> getInstance();

    ~CUDADriverWrapper();
    CUDADriverWrapper(CUDADriverWrapper const&) = delete;
    CUDADriverWrapper operator=(CUDADriverWrapper const&) = delete;
    CUDADriverWrapper(CUDADriverWrapper&&) = delete;
    CUDADriverWrapper operator=(CUDADriverWrapper&&) = delete;

    CUresult cuGetErrorName(CUresult error, char const** pStr) const;
    CUresult cuGetErrorMessage(CUresult error, char const** pStr) const;
    CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;
    CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;
    CUresult cuModuleUnload(CUmodule hmod) const;
    CUresult cuLinkDestroy(CUlinkState state) const;
    CUresult cuModuleLoadData(CUmodule* module, void const* image) const;
    CUresult cuLinkCreate(
        unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;
    CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const;
    CUresult cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const;
    CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path, unsigned int numOptions,
        CUjit_option* options, void** optionValues) const;
    CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, char const* name,
        unsigned int numOptions, CUjit_option* options, void** optionValues) const;
    CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
        unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
        unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const;
    CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
        CUstream hStream, void** kernelParams, void** extra) const;
    CUresult cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank,
        void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim,
        cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
        CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const;
    CUresult cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const;

private:
    void* handle;
    CUDADriverWrapper();

    CUresult (*_cuGetErrorName)(CUresult, char const**);
    CUresult (*_cuGetErrorMessage)(CUresult, char const**);
    CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
    CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
    CUresult (*_cuModuleUnload)(CUmodule);
    CUresult (*_cuLinkDestroy)(CUlinkState);
    CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*);
    CUresult (*_cuModuleLoadData)(CUmodule*, void const*);
    CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, char const*);
    CUresult (*_cuModuleGetGlobal)(CUdeviceptr*, size_t*, CUmodule, char const*);
    CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, char const*, unsigned int, CUjit_option*, void**);
    CUresult (*_cuLinkAddData)(
        CUlinkState, CUjitInputType, void*, size_t, char const*, unsigned int, CUjit_option*, void**);
    CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
        unsigned int, unsigned int, unsigned int, CUstream, void**);
    CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
        CUstream hStream, void** kernelParams, void** extra);
    CUresult (*_cuTensorMapEncodeTiled)(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
        cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
        cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
        CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
    CUresult (*_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
};

template <typename T>
void checkDriver(
    T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line)
{
    if (result)
    {
        char const* errorName = nullptr;
        char const* errorMsg = nullptr;
        wrap.cuGetErrorName(result, &errorName);
        wrap.cuGetErrorMessage(result, &errorMsg);
        throw TllmException(
            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s", func, errorName, errorMsg));
    }
}

} // namespace tensorrt_llm::common
/*
* Macros compliant with TensorRT coding conventions
*/
#define TLLM_CU_CHECK(stat) \
do \
{ \
tensorrt_llm::common::checkDriver( \
(stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__); \
} while (0)
#endif // CUDA_DRIVER_WRAPPER_H
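
A hedged sketch of the intended call pattern follows: fetch the shared wrapper, go through it for driver API calls, and wrap each call in TLLM_CU_CHECK. The kernel name and cubin image are placeholders, not anything from the deleted sources.

// Illustrative-only: resolving a kernel from a CUDA module via the lazily loaded driver wrapper.
#include "tensorrt_llm/common/cudaDriverWrapper.h"

CUfunction loadExampleKernel(void const* cubinImage)
{
    auto driver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
    CUmodule module{};
    CUfunction function{};
    TLLM_CU_CHECK(driver->cuModuleLoadData(&module, cubinImage));
    TLLM_CU_CHECK(driver->cuModuleGetFunction(&function, module, "myKernel")); // "myKernel" is a placeholder
    return function;
}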
sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.cu · deleted (100644 → 0)
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include <algorithm>
#include <cstdio>
#include <cuda_fp16.h>
#include <limits>
#include <type_traits>
namespace tensorrt_llm
{
namespace common
{
#ifdef ENABLE_FP8
constexpr int CTA_SIZE = 256;

template <bool QUANTIZE>
__inline__ __device__ float scale(float a, float b)
{
    return QUANTIZE ? a / b : a * b;
}

template <QuantizeMode QUANTIZE_MODE, bool QUANTIZE, typename T_OUT, typename T_S, typename T_IN>
__global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda)
{
    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel; i += blockDim.x * gridDim.x)
    {
        if (QUANTIZE_MODE == QuantizeMode::PER_CHANNEL)
        {
            output[i] = T_OUT(scale<QUANTIZE>(static_cast<float>(input[i]), static_cast<float>(input_scale[i % lda])));
        }
        else if (QUANTIZE_MODE == QuantizeMode::PER_TOKEN)
        {
            output[i] = T_OUT(scale<QUANTIZE>(static_cast<float>(input[i]), static_cast<float>(input_scale[i / lda])));
        }
        else if (QUANTIZE_MODE == QuantizeMode::PER_TENSOR)
        {
            output[i] = T_OUT(scale<QUANTIZE>(static_cast<float>(input[i]), static_cast<float>(input_scale[0])));
        }
    }
}

template <typename T_OUT, typename T_S, typename T_IN>
void invokeQuantizeMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda,
    QuantizeMode quantize_mode, cudaStream_t stream)
{
    dim3 grid(1024);
    dim3 block(CTA_SIZE);
    if (quantize_mode == QuantizeMode::PER_CHANNEL)
        scaleMatrix<QuantizeMode::PER_CHANNEL, true><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    else if (quantize_mode == QuantizeMode::PER_TOKEN)
        scaleMatrix<QuantizeMode::PER_TOKEN, true><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    else if (quantize_mode == QuantizeMode::PER_TENSOR)
        scaleMatrix<QuantizeMode::PER_TENSOR, true><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    sync_check_cuda_error();
}

template <typename T_OUT, typename T_S, typename T_IN>
void invokeDequantizeMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda,
    QuantizeMode quantize_mode, cudaStream_t stream)
{
    dim3 grid(1024);
    dim3 block(CTA_SIZE);
    if (quantize_mode == QuantizeMode::PER_CHANNEL)
        scaleMatrix<QuantizeMode::PER_CHANNEL, false><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    else if (quantize_mode == QuantizeMode::PER_TOKEN)
        scaleMatrix<QuantizeMode::PER_TOKEN, false><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    else if (quantize_mode == QuantizeMode::PER_TENSOR)
        scaleMatrix<QuantizeMode::PER_TENSOR, false><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
    sync_check_cuda_error();
}

template <typename T_FAKE, typename T_OUT, typename T_IN>
__global__ void fakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel)
{
    for (int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < numel; tid += blockDim.x * gridDim.x)
    {
        T_FAKE tmp = (T_FAKE) (static_cast<float>(src[tid]));
        dst[tid] = (T_OUT) (static_cast<float>(tmp));
    }
}

template <typename T_FAKE, typename T_OUT, typename T_IN>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel, cudaStream_t stream)
{
    fakeQuantize<T_FAKE><<<1024, CTA_SIZE, 0, stream>>>(dst, src, numel);
    sync_check_cuda_error();
}

template void invokeFakeQuantize<__nv_fp8_e4m3, float, float>(
    float* dst, float const* src, const int64_t numel, cudaStream_t stream);
template void invokeFakeQuantize<float, float, __nv_fp8_e4m3>(
    float* dst, __nv_fp8_e4m3 const* src, const int64_t numel, cudaStream_t stream);
template void invokeFakeQuantize<__nv_fp8_e4m3, half, half>(
    half* dst, half const* src, const int64_t numel, cudaStream_t stream);
template void invokeFakeQuantize<__nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* dst, __nv_bfloat16 const* src, const int64_t numel, cudaStream_t stream);
template void invokeFakeQuantize<float, half, float>(
    half* dst, float const* src, const int64_t numel, cudaStream_t stream);

__device__ float atomicMaxExtd(float* address, float val)
{
    assert(val >= 0);
    unsigned int* address_as_u = reinterpret_cast<unsigned int*>(address);
    unsigned int old = atomicMax(address_as_u, __float_as_uint(val));
    return __uint_as_float(old);
}

template <typename T>
inline __device__ T atomicMaxExtdV2(T* address, T val)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    static_assert(std::is_same_v<T, half> | std::is_same_v<T, __nv_bfloat16>, "T needs to be either half or bfloat16");
    // The address in 64 bits.
    uint64_t address_u64 = reinterpret_cast<uint64_t const&>(address);

    // Pack the input value into 32 bits.
    union
    {
        T v[2];
        uint16_t u[2];
    } old, tmp = {};

    int const loc = (address_u64 & 0x2) >> 1;
    tmp.v[loc] = val;

    // 4B aligned pointer.
    auto aligned_address = reinterpret_cast<T*>(address_u64 & ~0x3ull);

    if constexpr (std::is_same_v<T, half>)
    {
        asm volatile("atom.global.v2.f16.max.noftz {%0, %1}, [%2], {%3, %4};"
                     : "=h"(old.u[0]), "=h"(old.u[1])
                     : "l"(aligned_address), "h"(tmp.u[0]), "h"(tmp.u[1]));
    }
    if constexpr (std::is_same_v<T, __nv_bfloat16>)
    {
        asm volatile("atom.global.v2.bf16.max.noftz {%0, %1}, [%2], {%3, %4};"
                     : "=h"(old.u[0]), "=h"(old.u[1])
                     : "l"(aligned_address), "h"(tmp.u[0]), "h"(tmp.u[1]));
    }

    // Return the correct half.
    return old.v[loc];
#endif
}

__device__ half atomicMaxExtd(half* address, half val)
{
    unsigned short int* address_as_u = reinterpret_cast<unsigned short int*>(address);
    unsigned short int old = *address_as_u, assumed;

    while (val > __ushort_as_half(old))
    {
        assumed = old;
        old = atomicCAS(address_as_u, assumed, __half_as_ushort(val));
    }
    return __ushort_as_half(old);
}

__device__ __nv_bfloat16 atomicMaxExtd(__nv_bfloat16* address, __nv_bfloat16 val)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    unsigned short int* address_as_u = reinterpret_cast<unsigned short int*>(address);
    unsigned short int old = *address_as_u, assumed;

    while (val > __ushort_as_bfloat16(old))
    {
        assumed = old;
        old = atomicCAS(address_as_u, assumed, __bfloat16_as_ushort(val));
    }
    return __ushort_as_bfloat16(old);
#else
    assert(0);
    asm volatile("brkpt;\n" ::);
    return __nv_bfloat16(0);
#endif
}

template <QuantizeMode QUANTIZE_MODE, typename T_S, typename T_W>
__global__ void computeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t size, const int64_t n)
{
    constexpr float min_scaling_factor = 1.0f / (FP8_E4M3_MAX * 512.f);
    if (QUANTIZE_MODE == QuantizeMode::PER_CHANNEL)
    {
        for (int64_t col = threadIdx.x; col < n; col += blockDim.x)
        {
            float max = 0.f;
            for (int64_t i = col + n * blockIdx.x; i < size; i += gridDim.x * n)
            {
                auto val = fabs(static_cast<float>(weights[i]));
                max = max > val ? max : val;
            }
            auto const scale = (T_S) std::max(max / FP8_E4M3_MAX, min_scaling_factor);
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
            if constexpr (std::is_same_v<T_S, float>)
            {
                atomicMaxExtd(quant_ptr + col, scale);
            }
            else
            {
                auto const address_u64 = reinterpret_cast<uint64_t>(quant_ptr + col);
                if ((col == 0 && address_u64 % 4 != 0) || (col == n - 1 && address_u64 % 4 == 0))
                    atomicMaxExtd(quant_ptr + col, scale);
                else
                    atomicMaxExtdV2(quant_ptr + col, scale);
            }
#else  // Vector atomics require __CUDA_ARCH__ >= 900
            atomicMaxExtd(quant_ptr + col, scale);
#endif
        }
    }
    else if (QUANTIZE_MODE == QuantizeMode::PER_TOKEN)
    {
        auto const nrows = size / n;
        for (int64_t row = blockIdx.x; row < nrows; row += gridDim.x)
        {
            float max = 0.f;
            for (int64_t i = threadIdx.x; i < n; i += blockDim.x)
            {
                auto val = fabs(static_cast<float>(weights[row * n + i]));
                max = max > val ? max : val;
            }
            max = blockReduceMax<float>(max);
            if (threadIdx.x == 0)
            {
                auto const scale = (T_S) std::max(max / FP8_E4M3_MAX, min_scaling_factor);
                quant_ptr[row] = scale;
            }
        }
    }
    else if (QUANTIZE_MODE == QuantizeMode::PER_TENSOR)
    {
        float max = 0.f;
        for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < size; i += gridDim.x * blockDim.x)
        {
            auto val = fabs(static_cast<float>(weights[i]));
            max = max > val ? max : val;
        }
        max = blockReduceMax<float>(max);
        if (threadIdx.x == 0)
        {
            auto const scale = (T_S) std::max(max / FP8_E4M3_MAX, min_scaling_factor);
            atomicMaxExtd(quant_ptr, scale);
        }
    }
}

template <typename T_S, typename T_W>
void invokeComputeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t numel, const int64_t lda,
    QuantizeMode quantize_mode, cudaStream_t stream)
{
    if (quantize_mode == QuantizeMode::PER_TOKEN)
    {
        dim3 block(CTA_SIZE);
        dim3 grid(numel / lda);
        computeFP8QuantizeScale<QuantizeMode::PER_TOKEN><<<grid, block, 0, stream>>>(quant_ptr, weights, numel, lda);
    }
    else if (quantize_mode == QuantizeMode::PER_CHANNEL)
    {
        dim3 block(CTA_SIZE);
        dim3 grid((lda + CTA_SIZE - 1) / CTA_SIZE);
        cudaMemsetAsync(quant_ptr, 0, lda * sizeof(T_S), stream);
        sync_check_cuda_error();
        computeFP8QuantizeScale<QuantizeMode::PER_CHANNEL><<<grid, block, 0, stream>>>(quant_ptr, weights, numel, lda);
    }
    else if (quantize_mode == QuantizeMode::PER_TENSOR)
    {
        dim3 block(1024);
        dim3 grid(1024);
        cudaMemsetAsync(quant_ptr, 0, sizeof(T_S), stream);
        sync_check_cuda_error();
        computeFP8QuantizeScale<QuantizeMode::PER_TENSOR><<<grid, block, 0, stream>>>(quant_ptr, weights, numel, lda);
    }
    sync_check_cuda_error();
}

#define DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(type_scale, type_in) \
    template void invokeComputeFP8QuantizeScale<type_scale, type_in>(type_scale * input_scale, type_in const* weights, \
    int64_t numel, int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);

DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(half, half);
DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(float, half);
DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(float, float);
#ifdef ENABLE_BF16
DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(__nv_bfloat16, __nv_bfloat16);
DEFINE_INVOKE_COMPUTE_FP8_QUANTIZE_SCALE(float, __nv_bfloat16);
#endif

template <typename T_OUT, typename T_S, typename T_IN>
__global__ void dynamicQuantizeMatrixPerToken(T_OUT* output, T_S* quant_ptr, T_IN const* input, int64_t numel, int64_t lda)
{
    extern __shared__ __align__(sizeof(float)) char _shmem[];
    T_IN* shmem = reinterpret_cast<T_IN*>(_shmem);
    constexpr float min_scaling_factor = 1.0f / (FP8_E4M3_MAX * 512.f);
    auto const nrows = numel / lda;
    for (int64_t row = blockIdx.x; row < nrows; row += gridDim.x)
    {
        float max = 0.f;
        for (int64_t i = threadIdx.x; i < lda; i += blockDim.x)
        {
            auto const in = input[row * lda + i];
            shmem[i] = in;
            auto val = fabs(static_cast<float>(in));
            max = max > val ? max : val;
        }
        max = blockAllReduceMax<float>(max); // __syncthreads() called so we can read shmem
        auto const s = (T_S) std::max(max / FP8_E4M3_MAX, min_scaling_factor);
        for (int64_t i = threadIdx.x; i < lda; i += blockDim.x)
        {
            // true means we are quantizing
            output[row * lda + i] = (T_OUT) scale<true>(static_cast<float>(shmem[i]), static_cast<float>(s));
        }
        if (threadIdx.x == 0)
        {
            quant_ptr[row] = s;
        }
    }
}

template <typename T_OUT, typename T_S, typename T_IN>
void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T_IN* input, const int64_t numel,
    const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream)
{
    if (quantize_mode == QuantizeMode::PER_TOKEN)
    {
        dim3 grid(numel / lda);
        bool use_shmem = true;
        auto const shmem_size = lda * sizeof(T_IN);
        if (shmem_size >= (48 << 10))
        {
            cudaError_t ret = cudaFuncSetAttribute(dynamicQuantizeMatrixPerToken<T_OUT, T_S, T_IN>,
                cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_size);
            use_shmem = ret == cudaSuccess;
        }
        if (use_shmem)
        {
            // ensure the threadblock is as large as possible to increase occupancy
            dim3 block(std::min((lda + 31) / 32 * 32, static_cast<int64_t>(1024)));
            dynamicQuantizeMatrixPerToken<<<grid, block, shmem_size, stream>>>(output, quant_ptr, input, numel, lda);
        }
        else
        {
            dim3 block(CTA_SIZE);
            computeFP8QuantizeScale<QuantizeMode::PER_TOKEN><<<grid, block, 0, stream>>>(quant_ptr, input, numel, lda);
            sync_check_cuda_error();
            invokeQuantizeMatrix(output, quant_ptr, input, numel, lda, quantize_mode, stream);
        }
    }
    else if (quantize_mode == QuantizeMode::PER_CHANNEL)
    {
        dim3 block(CTA_SIZE);
        dim3 grid((lda + CTA_SIZE - 1) / CTA_SIZE);
        cudaMemsetAsync(quant_ptr, 0, lda * sizeof(T_S), stream);
        sync_check_cuda_error();
        computeFP8QuantizeScale<QuantizeMode::PER_CHANNEL><<<grid, block, 0, stream>>>(quant_ptr, input, numel, lda);
        sync_check_cuda_error();
        invokeQuantizeMatrix(output, quant_ptr, input, numel, lda, quantize_mode, stream);
    }
    else if (quantize_mode == QuantizeMode::PER_TENSOR)
    {
        dim3 block(1024);
        dim3 grid(1024);
        cudaMemsetAsync(quant_ptr, 0, sizeof(T_S), stream);
        sync_check_cuda_error();
        computeFP8QuantizeScale<QuantizeMode::PER_TENSOR><<<grid, block, 0, stream>>>(quant_ptr, input, numel, lda);
        sync_check_cuda_error();
        invokeQuantizeMatrix(output, quant_ptr, input, numel, lda, quantize_mode, stream);
    }
    sync_check_cuda_error();
}

#define DEFINE_INVOKE_QUANTIZE_MATRIX(type_out, type_scale, type_in) \
    template void invokeQuantizeMatrix<type_out, type_scale, type_in>(type_out * output, \
    type_scale const* input_scale, type_in const* input, int64_t numel, int64_t lda, QuantizeMode quantize_mode, \
    cudaStream_t stream); \
    template void invokeDequantizeMatrix<type_out, type_scale, type_in>(type_out * output, \
    type_scale const* input_scale, type_in const* input, int64_t numel, int64_t lda, QuantizeMode quantize_mode, \
    cudaStream_t stream); \
    template void invokeComputeScalesAndQuantizeMatrix<type_out, type_scale, type_in>(type_out * output, \
    type_scale * input_scale, type_in const* input, int64_t numel, int64_t lda, QuantizeMode quantize_mode, \
    cudaStream_t stream);

#ifdef ENABLE_FP8
DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_fp8_e4m3, float, float);
DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_fp8_e4m3, float, half);
DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_fp8_e4m3, half, half);
DEFINE_INVOKE_QUANTIZE_MATRIX(half, half, __nv_fp8_e4m3);
DEFINE_INVOKE_QUANTIZE_MATRIX(float, float, __nv_fp8_e4m3);
DEFINE_INVOKE_QUANTIZE_MATRIX(half, float, __nv_fp8_e4m3);
#ifdef ENABLE_BF16
DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16);
DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, __nv_bfloat16, __nv_fp8_e4m3);
#endif
#endif

#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
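
A minimal host-side sketch of how the launchers above are typically driven for per-token FP8 quantization of a row-major activation matrix. It assumes an ENABLE_FP8 build with this file's header on the include path; the wrapper name quantizeActivationsPerToken and the buffer names are illustrative only, not part of the deleted code.

// Sketch only: per-token FP8 quantization of a [num_tokens, hidden_size] half matrix.
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include "tensorrt_llm/common/cudaFp8Utils.h"

using namespace tensorrt_llm::common;

void quantizeActivationsPerToken(__nv_fp8_e4m3* d_out, float* d_scales, half const* d_in,
    int64_t num_tokens, int64_t hidden_size, cudaStream_t stream)
{
    // numel is the total element count, lda the row length; PER_TOKEN writes one scale per row into d_scales.
    int64_t const numel = num_tokens * hidden_size;
    invokeComputeScalesAndQuantizeMatrix(d_out, d_scales, d_in, numel, hidden_size, QuantizeMode::PER_TOKEN, stream);
}

This corresponds to the explicit instantiation DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_fp8_e4m3, float, half) above, so no extra template instantiation is needed.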
sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
deleted 100644 → 0
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <stdint.h>
#define FP8_MHA
#define FUSE_GEMM_ACT
#define FP8_GEMM_OUTPUT_QUANT_DISABLE
#ifdef FUSE_GEMM_ACT
#define USE_QGMMA
#endif
namespace tensorrt_llm
{
namespace common
{
constexpr float FP8_E4M3_MAX = 448.0f;

enum QuantizeMode
{
    PER_CHANNEL,
    PER_TENSOR,
    PER_CHANNEL_WEIGHT_PER_TENSOR_ACT,
    PER_TOKEN,
};

// Packed Data Type
typedef struct __CUDA_ALIGN__(32) { float array[8]; } float8;
typedef struct __CUDA_ALIGN__(16) { half array[8]; } half8;
typedef struct __CUDA_ALIGN__(8) { half2 array[2]; } half2_2;
typedef struct __CUDA_ALIGN__(8) { half array[4]; } half_4;

#ifdef ENABLE_BF16
typedef struct __CUDA_ALIGN__(4) { __nv_bfloat16 array[2]; } __nv_bfloat16_2;
typedef struct __CUDA_ALIGN__(8) { __nv_bfloat162 x, y; } __nv_bfloat162_2_xy;
typedef struct __CUDA_ALIGN__(8) { __nv_bfloat16 array[4]; } __nv_bfloat164;
typedef struct __CUDA_ALIGN__(8) { __nv_bfloat162 array[2]; } __nv_bfloat162_2;
typedef struct __CUDA_ALIGN__(16) { __nv_bfloat16 array[8]; } __nv_bfloat168;
typedef struct __CUDA_ALIGN__(16) { __nv_bfloat162 array[4]; } __nv_bfloat162_4;
typedef struct __CUDA_ALIGN__(32) { __nv_bfloat16 array[16]; } __nv_bfloat1616;
#endif

#ifdef ENABLE_FP8
typedef struct __CUDA_ALIGN__(2) { __nv_fp8_e4m3 array[2]; } __nv_fp8_2_e4m3;
typedef struct __CUDA_ALIGN__(4) { __nv_fp8_e4m3 array[4]; } __nv_fp8_4_e4m3;
typedef struct __CUDA_ALIGN__(4) { __nv_fp8x2_e4m3 array[2]; } __nv_fp8x2_x2_e4m3;
typedef struct __CUDA_ALIGN__(8) { __nv_fp8_e4m3 array[8]; } __nv_fp8_8_e4m3;
typedef struct __CUDA_ALIGN__(8) { __nv_fp8x2_e4m3 array[4]; } __nv_fp8x2_x4_e4m3;
typedef struct __CUDA_ALIGN__(16) { __nv_fp8_e4m3 array[16]; } __nv_fp8x16_e4m3;
#endif

// only BF16 and FP8
template <typename T, int PACK_SIZE>
struct PackType
{
    using type = float;
};

#ifdef ENABLE_BF16
template <> struct PackType<__nv_bfloat16, 2> { using type = __nv_bfloat16_2; };
template <> struct PackType<__nv_bfloat16, 4> { using type = __nv_bfloat164; };
template <> struct PackType<__nv_bfloat16, 8> { using type = __nv_bfloat168; };
#endif

#ifdef ENABLE_FP8
template <> struct PackType<__nv_fp8_e4m3, 2> { using type = __nv_fp8_2_e4m3; };
template <> struct PackType<__nv_fp8_e4m3, 4> { using type = __nv_fp8_4_e4m3; };
template <> struct PackType<__nv_fp8_e4m3, 8> { using type = __nv_fp8_8_e4m3; };
#endif

__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in)
{
    const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
    *out1 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
    *out2 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
}

__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(__nv_fp8x2_e4m3 const* in)
{
    const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
    __nv_bfloat162 out = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
    return out;
}

__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, __nv_fp8x4_e4m3 const* in)
{
    const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
    *out1 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
    *out2 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
}

__inline__ __device__ half2 fp8x2_e4m3_to_half2(__nv_fp8x2_e4m3 const* in)
{
    const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
    half2 out = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
    return out;
}

template <typename T_OUT, typename T_S, typename T_IN>
void invokeQuantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda,
    QuantizeMode quantize_mode, cudaStream_t stream);

template <typename T_OUT, typename T_S, typename T_IN>
void invokeDequantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel,
    int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);

template <typename T_FAKE, typename T_OUT, typename T_IN>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel, cudaStream_t stream);

template <typename T_S, typename T_W>
void invokeComputeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t k, const int64_t lda,
    QuantizeMode quantize_mode, cudaStream_t stream);

template <typename T_OUT, typename T_S, typename T_IN>
void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T_IN* weights, const int64_t numel,
    const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);

} // namespace common
} // namespace tensorrt_llm
#endif // ENABLE_FP8
sgl-kernel/3rdparty/tensorrt_llm/common/cudaTypeUtils.cuh
deleted 100644 → 0
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include <assert.h>
#include <cuda.h>
#include <cuda_fp16.h>
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif
namespace tensorrt_llm
{
namespace common
{
template <typename T>
inline __device__ T ldg(T const* val)
{
    return __ldg(val);
}

#if ENABLE_BF16
template <>
inline __device__ __nv_bfloat162 ldg(__nv_bfloat162 const* val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return val[0];
#else
    return __ldg(val);
#endif
}

template <>
inline __device__ __nv_bfloat16 ldg(__nv_bfloat16 const* val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return val[0];
#else
    return __ldg(val);
#endif
}
#endif // ENABLE_BF16

// Get type2 from type or vice versa (applied to half and bfloat16)
template <typename T>
struct TypeConverter
{
    using Type = half2;
}; // keep for generality

template <> struct TypeConverter<half2> { using Type = half; };
template <> struct TypeConverter<half> { using Type = half2; };
#if ENABLE_BF16
template <> struct TypeConverter<__nv_bfloat162> { using Type = __nv_bfloat16; };
template <> struct TypeConverter<__nv_bfloat16> { using Type = __nv_bfloat162; };
#endif // ENABLE_BF16

// Defined math operations (bfloat16 fallback to fp32 when it is not supported)
template <typename T>
inline __device__ T hadd2(T a, T b)
{
    return __hadd2(a, b);
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 hadd2(__nv_bfloat162 a, __nv_bfloat162 b) { return bf16hadd2(a, b); }
#endif // ENABLE_BF16

template <typename T>
inline __device__ T add(T a, T b)
{
    return a + b;
}

template <> inline __device__ half2 add(half2 a, half2 b) { return __hadd2(a, b); }
template <> inline __device__ half add(half a, half b) { return __hadd(a, b); }

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) { return bf16hadd2(a, b); }
template <> inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { return bf16hadd(a, b); }
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, float b) { return bf16hadd(a, __float2bfloat16(b)); }
#endif // ENABLE_BF16

// applies to all 4 values addition
template <typename T>
inline __device__ T add(T a, T b, T c)
{
    return a + b + c;
}

#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { return bf16hadd(a, b, c); }
inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { return bf16hadd2(a, b, c); }
#endif // ENABLE_BF16

// applies to all 4 values addition
template <typename T>
inline __device__ T add(T a, T b, T c, T d)
{
    return (T) ((float) a + (float) b + (float) c + (float) d);
}

#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d)
{
    return bf16hadd(a, b, c, d);
}
#endif // ENABLE_BF16

template <typename T>
inline __device__ T hsub2(T a, T b)
{
    return __hsub2(a, b);
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 hsub2(__nv_bfloat162 a, __nv_bfloat162 b) { return bf16hsub2(a, b); }
#endif // ENABLE_BF16

template <typename T>
inline __device__ T hmul2(T a, T b)
{
    return __hmul2(a, b);
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b) { return bf16hmul2(a, b); }
#endif // ENABLE_BF16

template <typename T>
inline __device__ T hmul2(T a, T b, T c)
{
    return a * b * c;
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
{
    return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16

template <typename T>
inline __device__ T mul(T a, T b, T c)
{
    return a * b * c;
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
{
    return bf16hmul(a, b, c);
}

inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
{
    return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16

template <typename T>
inline __device__ T fma(T a, T b, T c, T d)
{
    return a * b * c + d;
}

#if ENABLE_BF16
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d)
{
    return bf16hfma2(a, b, c, d);
}
#endif // ENABLE_BF16

template <typename T>
inline __device__ T fma(T a, T b, T c)
{
    return a * b + c;
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
{
    return bf16hfma2(a, b, c);
}

template <> inline __device__ __nv_bfloat16 fma(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
{
    return bf16hfma(a, b, c);
}
#endif // ENABLE_BF16

template <typename T>
inline __device__ T hexp2(T a)
{
    return h2exp(a);
}

#if ENABLE_BF16
template <> inline __device__ __nv_bfloat162 hexp2(__nv_bfloat162 a) { return bf16exp2(a); }
#endif // ENABLE_BF16

template <typename T_OUT, typename T_IN>
__device__ inline T_OUT cuda_cast(T_IN val)
{
    return val;
}

template <> __device__ inline float2 cuda_cast<float2, int2>(int2 val) { return make_float2(val.x, val.y); }
template <> __device__ inline float2 cuda_cast<float2, float>(float val) { return make_float2(val, val); }
template <> __device__ inline float2 cuda_cast<float2, half2>(half2 val) { return __half22float2(val); }
template <> __device__ inline half2 cuda_cast<half2, float2>(float2 val) { return __float22half2_rn(val); }
template <> __device__ inline half2 cuda_cast<half2, float>(float val) { return __float2half2_rn(val); }
template <> __device__ inline half2 cuda_cast<half2, half>(half val) { return __half2half2(val); }

template <> __device__ inline int8_t cuda_cast<int8_t, half>(half val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    union
    {
        half fp16;
        int16_t int16_in;
    };

    fp16 = val;
    asm volatile("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
    return int8[0];
}

template <> __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int8[0] = cuda_cast<int8_t>(val.x);
    int8[1] = cuda_cast<int8_t>(val.y);
    return int16;
}

template <> __device__ inline int8_t cuda_cast<int8_t, float>(float val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
    return int8[0];
}

template <> __device__ inline int16_t cuda_cast<int16_t, float2>(float2 val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int8[0] = cuda_cast<int8_t>(val.x);
    int8[1] = cuda_cast<int8_t>(val.y);
    return int16;
}

template <> __device__ inline half2 cuda_cast<half2, int16_t>(int16_t val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int16 = val;
    return make_half2(int8[0], int8[1]);
}

template <> __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int16 = val;
    return make_float2(int8[0], int8[1]);
}

#ifdef ENABLE_BF16
template <> __device__ inline __nv_bfloat16 cuda_cast(int32_t val) { return static_cast<float>(val); }
template <> __device__ inline __nv_bfloat16 cuda_cast(int8_t val) { return static_cast<float>(val); }
template <> __device__ inline int8_t cuda_cast(__nv_bfloat16 val) { return static_cast<float>(val); }
template <> __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { return __bfloat162float(val); }
template <> __device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622float2(val); }

template <> __device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val)
{
    return __float2half(__bfloat162float(val));
}

template <> __device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622int16(val); }
template <> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) { return __float2bfloat16(val); }

template <> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val)
{
    return __float2bfloat16(__half2float(val));
}

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val)
{
    return bf162bf162(val);
}

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val)
{
    return __float2bfloat162_rn(val);
}

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) { return float22bf162(val); }

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val)
{
    union
    {
        int8_t int8[2];
        int16_t int16;
    };

    int16 = val;
    __nv_bfloat162 res;
    res.x = cuda_cast<__nv_bfloat16>(int8[0]);
    res.y = cuda_cast<__nv_bfloat16>(int8[1]);
    return res;
}

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val)
{
    return float22bf162(__half22float2(val));
}

#endif // ENABLE BF16

template <typename T>
__device__ inline T cuda_abs(T val)
{
    assert(false);
    return {};
}

template <> __device__ inline float cuda_abs(float val) { return fabs(val); }
template <> __device__ inline float2 cuda_abs(float2 val) { return make_float2(fabs(val.x), fabs(val.y)); }
template <> __device__ inline half cuda_abs(half val) { return __habs(val); }
template <> __device__ inline half2 cuda_abs(half2 val) { return __habs2(val); }

#ifdef ENABLE_BF16
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
template <> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { return __habs(val); }
template <> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return __habs2(val); }
#endif
#endif // ENABLE_FP16

template <typename To, typename Ti>
__device__ inline To cuda_sum(Ti val)
{
    return cuda_cast<To>(val);
};

template <typename To>
__device__ inline To cuda_sum(float2 val)
{
    return cuda_cast<To>(val.x + val.y);
};

// Unary maximum: compute the max of a vector type
template <typename To, typename Ti>
__device__ inline To cuda_max(Ti val)
{
    return cuda_cast<To>(val);
};

template <> __device__ inline float cuda_max(float2 val) { return fmaxf(val.x, val.y); }
template <> __device__ inline half cuda_max(half2 val) { return __hmax(val.x, val.y); }

#ifdef ENABLE_BF16
template <> __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    return __hmax(val.x, val.y);
#else
    assert(0);
    asm volatile("brkpt;\n" ::);
    return __nv_bfloat16(0);
#endif
}
#endif

// Binary maximum: compute the max of two values.
template <typename T>
__device__ inline T cuda_max(T val1, T val2)
{
    return (val1 > val2) ? val1 : val2;
}

template <> __device__ inline float2 cuda_max(float2 val1, float2 val2)
{
    float2 out;
    out.x = fmaxf(val1.x, val2.x);
    out.y = fmaxf(val1.y, val2.y);
    return out;
}

template <> __device__ inline half2 cuda_max(half2 val1, half2 val2) { return __hmax2(val1, val2); }

#ifdef ENABLE_BF16
template <> __device__ inline __nv_bfloat162 cuda_max(__nv_bfloat162 val1, __nv_bfloat162 val2)
{
    return __hmax2(val1, val2);
}
#endif // ENABLE_BF16

// Binary maximum: compute the min of two values.
template <typename T>
__device__ inline T cuda_min(T val1, T val2)
{
    return (val1 < val2) ? val1 : val2;
}

template <> __device__ inline float2 cuda_min(float2 val1, float2 val2)
{
    float2 out;
    out.x = fminf(val1.x, val2.x);
    out.y = fminf(val1.y, val2.y);
    return out;
}

template <> __device__ inline half2 cuda_min(half2 val1, half2 val2) { return __hmin2(val1, val2); }

#ifdef ENABLE_BF16
template <> __device__ inline __nv_bfloat162 cuda_min(__nv_bfloat162 val1, __nv_bfloat162 val2)
{
    return __hmin2(val1, val2);
}
#endif // ENABLE_BF16

// Helper function of clamping the val into the given range.
template <typename T>
inline __device__ T cuda_clamp(T val, T minVal, T maxVal)
{
    return cuda_min(cuda_max(val, minVal), maxVal);
}

#ifdef ENABLE_FP8
template <> __device__ inline float2 cuda_cast<float2, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val)
{
    return bf1622float2(fp8x2_e4m3_to_bfloat2(&val));
}

template <> __device__ inline half2 cuda_cast<half2, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val)
{
    return fp8x2_e4m3_to_half2(&val);
}

template <> __device__ inline __nv_fp8x2_e4m3 cuda_cast<__nv_fp8x2_e4m3, float2>(float2 val)
{
    return __nv_fp8x2_e4m3(bf1622float2(float22bf162(val)));
}

template <> __device__ inline __nv_fp8x2_e4m3 cuda_cast<__nv_fp8x2_e4m3, half2>(half2 val)
{
    return __nv_fp8x2_e4m3(cuda_cast<float2>(val));
}

template <> __device__ inline __nv_fp8x2_e4m3 cuda_cast<__nv_fp8x2_e4m3, __nv_bfloat162>(__nv_bfloat162 val)
{
    return __nv_fp8x2_e4m3(cuda_cast<float2>(val));
}

template <> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, half>(half val) { return __nv_fp8_e4m3(val); }

template <> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, __nv_bfloat16>(__nv_bfloat16 val)
{
    return __nv_fp8_e4m3(val);
}

template <> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, float>(float val) { return __nv_fp8_e4m3(val); }
template <> __device__ inline float cuda_cast<float, __nv_fp8_e4m3>(__nv_fp8_e4m3 val) { return (float) val; }

template <> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val)
{
    return fp8x2_e4m3_to_bfloat2(&val);
}

template <> __device__ inline int8_t cuda_cast<int8_t, __nv_fp8_e4m3>(__nv_fp8_e4m3 val)
{
    // no impl
    return 0;
}

template <> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
{
    return cuda_cast<__nv_fp8_e4m3>(cuda_cast<__nv_bfloat16>(cuda_cast<float>(val)));
}

#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
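
A short sketch of how the cast and clamp helpers above are typically combined in a kernel. The kernel name and scaling parameter are illustrative assumptions; it presumes this header is included and the int8 saturating-cast specializations compiled in.

// Illustrative only: scale, clamp, and saturate-cast a tensor to int8 using the helpers above.
template <typename T>
__global__ void clampAndCastToInt8(int8_t* dst, T const* src, float inv_scale, int64_t n)
{
    using namespace tensorrt_llm::common;
    for (int64_t i = threadIdx.x + blockIdx.x * (int64_t) blockDim.x; i < n;
         i += (int64_t) blockDim.x * gridDim.x)
    {
        float v = cuda_cast<float>(src[i]) * inv_scale;
        // cuda_clamp keeps the value in the representable int8 range before the saturating cast.
        dst[i] = cuda_cast<int8_t>(cuda_clamp(v, -128.f, 127.f));
    }
}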
sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
deleted 100644 → 0 (diff collapsed)
sgl-kernel/3rdparty/tensorrt_llm/common/logger.cpp
deleted 100644 → 0
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/tllmException.h"
#include <cuda_runtime.h>
namespace tensorrt_llm::common
{

Logger::Logger()
{
    char* isFirstRankOnlyChar = std::getenv("TLLM_LOG_FIRST_RANK_ONLY");
    bool isFirstRankOnly = (isFirstRankOnlyChar != nullptr && std::string(isFirstRankOnlyChar) == "ON");

    auto const* levelName = std::getenv("TLLM_LOG_LEVEL");
    if (levelName != nullptr)
    {
        auto level = [levelName = std::string(levelName)]()
        {
            if (levelName == "TRACE") return TRACE;
            if (levelName == "DEBUG") return DEBUG;
            if (levelName == "INFO") return INFO;
            if (levelName == "WARNING") return WARNING;
            if (levelName == "ERROR") return ERROR;
            TLLM_THROW("Invalid log level: %s", levelName.c_str());
        }();

        // If TLLM_LOG_FIRST_RANK_ONLY=ON, set LOG LEVEL of other device to ERROR
        if (isFirstRankOnly)
        {
            auto const deviceId = getDevice();
            if (deviceId != 1)
            {
                level = ERROR;
            }
        }
        setLevel(level);
    }
}

void Logger::log(std::exception const& ex, Logger::Level level)
{
    log(level, "%s: %s", TllmException::demangle(typeid(ex).name()).c_str(), ex.what());
}

Logger* Logger::getLogger()
{
    thread_local Logger instance;
    return &instance;
}
} // namespace tensorrt_llm::common
sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
deleted 100644 → 0
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/stringUtils.h"
namespace tensorrt_llm::common
{

class Logger
{
// On Windows, the file wingdi.h is included which has
// #define ERROR 0
// This breaks everywhere ERROR is used in the Level enum
#ifdef _WIN32
#undef ERROR
#endif // _WIN32

public:
    enum Level
    {
        TRACE = 0,
        DEBUG = 10,
        INFO = 20,
        WARNING = 30,
        ERROR = 40
    };

    static Logger* getLogger();

    Logger(Logger const&) = delete;
    void operator=(Logger const&) = delete;

#if defined(_MSC_VER)
    template <typename... Args>
    void log(Level level, char const* format, Args const&... args);

    template <typename... Args>
    void log(Level level, int rank, char const* format, Args const&... args);
#else
    template <typename... Args>
    void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));

    template <typename... Args>
    void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
#endif

    template <typename... Args>
    void log(Level level, std::string const& format, Args const&... args)
    {
        return log(level, format.c_str(), args...);
    }

    template <typename... Args>
    void log(Level const level, int const rank, std::string const& format, Args const&... args)
    {
        return log(level, rank, format.c_str(), args...);
    }

    void log(std::exception const& ex, Level level = Level::ERROR);

    Level getLevel() const
    {
        return level_;
    }

    void setLevel(Level const level)
    {
        level_ = level;
        log(INFO, "Set logger level to %s", getLevelName(level));
    }

    bool isEnabled(Level const level) const
    {
        return level_ <= level;
    }

private:
    static auto constexpr kPREFIX = "[TensorRT-LLM]";

#ifndef NDEBUG
    Level const DEFAULT_LOG_LEVEL = DEBUG;
#else
    Level const DEFAULT_LOG_LEVEL = INFO;
#endif

    Level level_ = DEFAULT_LOG_LEVEL;

    Logger(); // NOLINT(modernize-use-equals-delete)

    static inline char const* getLevelName(Level const level)
    {
        switch (level)
        {
        case TRACE: return "TRACE";
        case DEBUG: return "DEBUG";
        case INFO: return "INFO";
        case WARNING: return "WARNING";
        case ERROR: return "ERROR";
        }

        TLLM_THROW("Unknown log level: %d", level);
    }

    static inline std::string getPrefix(Level const level)
    {
        return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
    }

    static inline std::string getPrefix(Level const level, int const rank)
    {
        return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
    }
};

template <typename... Args>
void Logger::log(Logger::Level level, char const* format, Args const&... args)
{
    if (isEnabled(level))
    {
        auto const fmt = getPrefix(level) + format;
        auto& out = level_ < WARNING ? std::cout : std::cerr;
        if constexpr (sizeof...(args) > 0)
        {
            out << fmtstr(fmt.c_str(), args...);
        }
        else
        {
            out << fmt;
        }
        out << std::endl;
    }
}

template <typename... Args>
void Logger::log(Logger::Level const level, int const rank, char const* format, Args const&... args)
{
    if (isEnabled(level))
    {
        auto const fmt = getPrefix(level, rank) + format;
        auto& out = level_ < WARNING ? std::cout : std::cerr;
        if constexpr (sizeof...(args) > 0)
        {
            out << fmtstr(fmt.c_str(), args...);
        }
        else
        {
            out << fmt;
        }
        out << std::endl;
    }
}
#define TLLM_LOG(level, ...) \
do \
{ \
auto* const logger = tensorrt_llm::common::Logger::getLogger(); \
if (logger->isEnabled(level)) \
{ \
logger->log(level, __VA_ARGS__); \
} \
} while (0)
#define TLLM_LOG_TRACE(...) TLLM_LOG(tensorrt_llm::common::Logger::TRACE, __VA_ARGS__)
#define TLLM_LOG_DEBUG(...) TLLM_LOG(tensorrt_llm::common::Logger::DEBUG, __VA_ARGS__)
#define TLLM_LOG_INFO(...) TLLM_LOG(tensorrt_llm::common::Logger::INFO, __VA_ARGS__)
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
} // namespace tensorrt_llm::common
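
A brief sketch of typical host-side use of this logger. The function name reportShape and the message strings are illustrative assumptions; in practice the level is usually set via the TLLM_LOG_LEVEL environment variable rather than in code.

// Illustrative only: configuring and using the logger and its convenience macros.
#include "tensorrt_llm/common/logger.h"

void reportShape(int batch_size, int seq_len)
{
    auto* logger = tensorrt_llm::common::Logger::getLogger();
    logger->setLevel(tensorrt_llm::common::Logger::DEBUG); // otherwise TLLM_LOG_LEVEL / TLLM_LOG_FIRST_RANK_ONLY apply
    TLLM_LOG_DEBUG("running with batch_size=%d seq_len=%d", batch_size, seq_len);
    TLLM_LOG_WARNING("only emitted when the configured level is WARNING or lower");
}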
sgl-kernel/3rdparty/tensorrt_llm/common/quantTypeUtils.cuh
deleted 100644 → 0
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <float.h>
namespace tensorrt_llm
{
namespace common
{

template <typename T>
struct QuantTypeStaticVals;

template <>
struct QuantTypeStaticVals<int8_t>
{
    static constexpr float MAX_VAL = 127.f;
    static constexpr float MIN_SCALING_FACTOR = 0.f;
    static constexpr float MIN_SCALING_FACTOR_RCP = FLT_MAX;
};

#ifdef ENABLE_FP8

template <>
struct QuantTypeStaticVals<__nv_fp8_e4m3>
{
    static constexpr float MAX_VAL = 448.f;
    // Ref: https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu#L720
    static constexpr float MIN_SCALING_FACTOR = 1.0f / (448.f * 512.f);
    static constexpr float MIN_SCALING_FACTOR_RCP = (448.f * 512.f);
};

#endif // ENABLE_FP8

} // namespace common
} // namespace tensorrt_llm
sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
deleted 100644 → 0
/*
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm
{
namespace common
{

class QuantMode
{
    // [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/quantization/mode.py
public:
    using BaseType = std::uint32_t;

    explicit constexpr QuantMode(BaseType value) noexcept
        : mValue{value}
    {
    }

    QuantMode() noexcept = default;
    constexpr QuantMode(QuantMode const&) noexcept = default;
    constexpr QuantMode& operator=(QuantMode const& other) noexcept = default;

    static constexpr QuantMode none() noexcept { return QuantMode(BaseType(0)); }
    static constexpr QuantMode int4Weights() noexcept { return QuantMode(BaseType(1u) << 0); }
    static constexpr QuantMode int8Weights() noexcept { return QuantMode(BaseType(1u) << 1); }
    static constexpr QuantMode activations() noexcept { return QuantMode(BaseType(1u) << 2); }
    static constexpr QuantMode perChannelScaling() noexcept { return QuantMode(BaseType(1u) << 3); }
    static constexpr QuantMode perTokenScaling() noexcept { return QuantMode(BaseType(1u) << 4); }
    static constexpr QuantMode perGroupScaling() noexcept { return QuantMode(BaseType(1u) << 5); }
    static constexpr QuantMode int8KvCache() noexcept { return QuantMode(BaseType(1u) << 6); }
    static constexpr QuantMode fp8KvCache() noexcept { return QuantMode(BaseType(1u) << 7); }
    static constexpr QuantMode fp8Qdq() noexcept { return QuantMode(BaseType(1u) << 8); }

    static constexpr QuantMode fp8RowWise() noexcept
    {
        return QuantMode(BaseType(1u) << 3 | BaseType(1u) << 4 | BaseType(1u) << 9);
    }

    static constexpr QuantMode w4a8QServe() noexcept { return QuantMode(BaseType(1u) << 10); }

    constexpr BaseType value() const noexcept { return mValue; }

    constexpr bool isSet(QuantMode const& mode) const noexcept { return (mValue & mode.value()) == mode.value(); }

    constexpr bool hasInt4Weights() const noexcept { return isSet(int4Weights()); }
    constexpr bool hasInt8Weights() const noexcept { return isSet(int8Weights()); }
    constexpr bool hasActivations() const noexcept { return isSet(activations()); }
    constexpr bool hasPerChannelScaling() const noexcept { return isSet(perChannelScaling()); }
    constexpr bool hasPerTokenScaling() const noexcept { return isSet(perTokenScaling()); }
    constexpr bool hasPerGroupScaling() const noexcept { return isSet(perGroupScaling()); }
    constexpr bool hasStaticActivationScaling() const noexcept { return !hasPerTokenScaling(); }
    constexpr bool hasInt8KvCache() const noexcept { return isSet(int8KvCache()); }
    constexpr bool hasFp8KvCache() const noexcept { return isSet(fp8KvCache()); }
    constexpr bool hasFp8Qdq() const noexcept { return isSet(fp8Qdq()); }
    constexpr bool hasFp8RowWise() const noexcept { return isSet(fp8RowWise()); }
    constexpr bool hasKvCacheQuant() const noexcept { return hasInt8KvCache() || hasFp8KvCache(); }

    static constexpr QuantMode fromDescription(bool quantizeWeights = false, bool quantizeActivations = false,
        bool perToken = false, bool perChannel = false, bool perGroup = false, bool useInt4Weights = false,
        bool useInt8KvCache = false, bool useFp8KvCache = false, bool useFp8Qdq = false, bool useFp8RowWise = false,
        bool useW4a8QServe = false)
    {
        QuantMode quantMode{};
        if (quantizeWeights)
        {
            if (useInt4Weights)
                quantMode += int4Weights();
            else
                quantMode += int8Weights();
        }

        if (quantizeActivations) { quantMode += activations(); }
        if (perChannel) { quantMode += QuantMode::perChannelScaling(); }
        if (perToken) { quantMode += QuantMode::perTokenScaling(); }
        if (perGroup) { quantMode += QuantMode::perGroupScaling(); }
        if (useInt8KvCache) { quantMode += int8KvCache(); }
        if (useFp8KvCache) { quantMode += fp8KvCache(); }
        if (useFp8Qdq) { quantMode += fp8Qdq(); }
        if (useFp8RowWise) { quantMode += fp8RowWise(); }
        if (useW4a8QServe) { quantMode += w4a8QServe(); }

        return quantMode;
    }

    static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
    {
        return fromDescription(true, true, perToken, perChannel);
    }

    static constexpr QuantMode useQServe(bool perGroup)
    {
        return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true);
    }

    static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
    {
        return fromDescription(true, false, false, false, perGroup, useInt4Weights);
    }

    static QuantMode const fromQuantAlgo(
        std::optional<std::string> quantAlgo = std::nullopt, std::optional<std::string> kvCacheQuantAlgo = std::nullopt)
    {
        QuantMode quantMode{};
        if (quantAlgo == "W8A16") { quantMode = useWeightOnly(false, false); }
        else if (quantAlgo == "W4A16") { quantMode = useWeightOnly(true, false); }
        else if (quantAlgo == "W4A16_AWQ") { quantMode = useWeightOnly(true, true); }
        else if (quantAlgo == "W4A8_AWQ") { quantMode = useWeightOnly(true, true); }
        else if (quantAlgo == "W4A8_QSERVE_PER_GROUP") { quantMode = useQServe(false); }
        else if (quantAlgo == "W4A8_QSERVE_PER_CHANNEL") { quantMode = useQServe(true); }
        else if (quantAlgo == "W4A16_GPTQ") { quantMode = useWeightOnly(true, true); }
        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL") { quantMode = useSmoothQuant(false, true); }
        else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PLUGIN") { quantMode = useSmoothQuant(false, false); }
        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN") { quantMode = useSmoothQuant(true, true); }
        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN") { quantMode = useSmoothQuant(false, true); }
        else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN") { quantMode = useSmoothQuant(true, false); }
        else if (quantAlgo == "FP8")
        {
            quantMode = fromDescription(false, false, false, false, false, false, false, false, true);
        }
        else if (quantAlgo == "FP8_ROWWISE")
        {
            quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true);
        }

        if (kvCacheQuantAlgo == "INT8") { quantMode += int8KvCache(); }
        else if (kvCacheQuantAlgo == "FP8") { quantMode += fp8KvCache(); }

        return quantMode;
    }

    constexpr QuantMode operator+(QuantMode const& other) const noexcept { return QuantMode(mValue | other.mValue); }
    constexpr QuantMode& operator+=(QuantMode const& other) noexcept { return *this = *this + other; }
    constexpr QuantMode operator-(QuantMode const& other) const noexcept { return QuantMode(mValue & ~other.mValue); }
    constexpr QuantMode& operator-=(QuantMode const& other) noexcept { return *this = *this - other; }
    constexpr bool operator==(QuantMode const& other) const noexcept { return mValue == other.mValue; }
    constexpr bool operator!=(QuantMode const& other) const noexcept { return !(*this == other); }

private:
    BaseType mValue{0};
};

} // namespace common
} // namespace tensorrt_llm
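
A short sketch of how the QuantMode bitmask above is typically constructed from algorithm strings and queried. The helper name wantsFp8KvCache is an illustrative assumption; the algorithm names themselves come from the switch in fromQuantAlgo.

// Illustrative only: build a QuantMode from quantization-algorithm names and inspect its flags.
#include "tensorrt_llm/common/quantization.h"

bool wantsFp8KvCache()
{
    using tensorrt_llm::common::QuantMode;
    // FP8 weights/activations (Q/DQ) together with an FP8 KV cache.
    auto const mode = QuantMode::fromQuantAlgo("FP8", "FP8");
    return mode.hasFp8Qdq() && mode.hasFp8KvCache() && !mode.hasInt8KvCache();
}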
sgl-kernel/3rdparty/tensorrt_llm/common/reduceKernelUtils.cuh
deleted 100644 → 0
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <array>
#include <assert.h>
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
#include <cooperative_groups/reduce.h>
#else
#include <cooperative_groups.h>
#endif
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <float.h>
#include <type_traits>
namespace
cg
=
cooperative_groups
;
namespace
tensorrt_llm
{
namespace
common
{
template
<
int
VPT
>
struct
BytesToType
;
template
<
>
struct
BytesToType
<
1
>
{
using
type
=
uint8_t
;
};
template
<
>
struct
BytesToType
<
2
>
{
using
type
=
uint16_t
;
};
template
<
>
struct
BytesToType
<
4
>
{
using
type
=
uint32_t
;
};
template
<
>
struct
BytesToType
<
8
>
{
using
type
=
uint64_t
;
};
template
<
>
struct
BytesToType
<
16
>
{
using
type
=
float4
;
};
template
<
int
Bytes
>
__device__
inline
void
copy
(
void
const
*
local
,
void
*
data
)
{
using
T
=
typename
BytesToType
<
Bytes
>::
type
;
T
const
*
in
=
static_cast
<
T
const
*>
(
local
);
T
*
out
=
static_cast
<
T
*>
(
data
);
*
out
=
*
in
;
}
static
float
constexpr
HALF_FLT_MAX
=
65504.
F
;
#define FINAL_MASK 0xffffffff
template
<
typename
T
>
__inline__
__device__
T
warpReduceSum
(
T
val
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
val
=
add
<
T
>
(
val
,
__shfl_xor_sync
(
FINAL_MASK
,
val
,
mask
,
32
));
//__shfl_sync bf16 return float when sm < 80
return
val
;
}
/* Calculate the sum of all elements in a block */
template
<
typename
T
>
__inline__
__device__
T
blockReduceSum
(
T
val
)
{
static
__shared__
T
shared
[
32
];
int
lane
=
threadIdx
.
x
&
0x1f
;
int
wid
=
threadIdx
.
x
>>
5
;
val
=
warpReduceSum
<
T
>
(
val
);
if
(
lane
==
0
)
shared
[
wid
]
=
val
;
__syncthreads
();
// Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
// blockDim.x is not divided by 32
val
=
(
threadIdx
.
x
<
(
blockDim
.
x
/
32.
f
))
?
shared
[
lane
]
:
(
T
)
(
0.0
f
);
val
=
warpReduceSum
<
T
>
(
val
);
return
val
;
}
template
<
typename
T
>
__inline__
__device__
T
warpReduceMax
(
T
val
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
val
=
max
(
val
,
__shfl_xor_sync
(
FINAL_MASK
,
val
,
mask
,
32
));
return
val
;
}
/* Calculate the maximum of all elements in a block */
template
<
typename
T
>
__inline__
__device__
T
blockReduceMax
(
T
val
)
{
static
__shared__
T
shared
[
32
];
int
lane
=
threadIdx
.
x
&
0x1f
;
// in-warp idx
int
wid
=
threadIdx
.
x
>>
5
;
// warp idx
val
=
warpReduceMax
(
val
);
// get maxx in each warp
if
(
lane
==
0
)
// record in-warp maxx by warp Idx
shared
[
wid
]
=
val
;
__syncthreads
();
// Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
// blockDim.x is not divided by 32
val
=
(
threadIdx
.
x
<
(
blockDim
.
x
/
32.
f
))
?
shared
[
lane
]
:
-
1e20
f
;
val
=
warpReduceMax
(
val
);
return
val
;
}
/* Calculate the maximum of all elements in a block */
template
<
typename
T
>
__inline__
__device__
T
blockAllReduceMax
(
T
val
)
{
static
__shared__
T
shared
[
32
];
int
lane
=
threadIdx
.
x
&
0x1f
;
// in-warp idx
int
wid
=
threadIdx
.
x
>>
5
;
// warp idx
val
=
warpReduceMax
(
val
);
// get maxx in each warp
if
(
lane
==
0
)
// record in-warp maxx by warp Idx
shared
[
wid
]
=
val
;
__syncthreads
();
// Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
// blockDim.x is not divided by 32
val
=
(
lane
<
(
blockDim
.
x
/
32.
f
))
?
shared
[
lane
]
:
-
1e20
f
;
val
=
warpReduceMax
(
val
);
return
val
;
}
template
<
typename
T
,
int
NUM
>
__inline__
__device__
T
warpReduceSumV2
(
T
*
val
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
val
[
i
]
+=
__shfl_xor_sync
(
FINAL_MASK
,
val
[
i
],
mask
,
32
);
}
return
(
T
)
(
0.0
f
);
}
template
<
typename
T
,
int
NUM
>
__inline__
__device__
T
blockReduceSumV2
(
T
*
val
)
{
static
__shared__
T
shared
[
NUM
][
33
];
int
lane
=
threadIdx
.
x
&
0x1f
;
int
wid
=
threadIdx
.
x
>>
5
;
warpReduceSumV2
<
T
,
NUM
>
(
val
);
if
(
lane
==
0
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
shared
[
i
][
wid
]
=
val
[
i
];
}
}
__syncthreads
();
bool
is_mask
=
threadIdx
.
x
<
(
blockDim
.
x
/
32.
f
);
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
val
[
i
]
=
is_mask
?
shared
[
i
][
lane
]
:
(
T
)
(
0.0
f
);
}
warpReduceSumV2
<
T
,
NUM
>
(
val
);
return
(
T
)
0.0
f
;
}
template
<
typename
T
,
int
NUM
>
__inline__
__device__
T
warpReduceMaxV2
(
T
*
val
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
#pragma unroll
for
(
int
mask
=
16
;
mask
>
0
;
mask
>>=
1
)
val
[
i
]
=
max
(
val
[
i
],
__shfl_xor_sync
(
FINAL_MASK
,
val
[
i
],
mask
,
32
));
}
return
(
T
)
(
0.0
f
);
}
template
<
typename
T
,
int
NUM
>
__inline__
__device__
T
blockReduceMaxV2
(
T
*
val
)
{
static
__shared__
T
shared
[
32
][
NUM
];
int
lane
=
threadIdx
.
x
&
0x1f
;
// in-warp idx
int
wid
=
threadIdx
.
x
>>
5
;
// warp idx
warpReduceMaxV2
<
T
,
NUM
>
(
val
);
// get maxx in each warp
if
(
lane
==
0
)
// record in-warp maxx by warp Idx
{
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
shared
[
wid
][
i
]
=
val
[
i
];
}
}
__syncthreads
();
// Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
// blockDim.x is not divided by 32
bool
is_mask
=
threadIdx
.
x
<
(
blockDim
.
x
/
32.
f
);
#pragma unroll
for
(
int
i
=
0
;
i
<
NUM
;
i
++
)
{
val
[
i
]
=
is_mask
?
shared
[
lane
][
i
]
:
(
T
)
-
1e20
f
;
}
warpReduceMaxV2
<
T
,
NUM
>
(
val
);
return
(
T
)
0.0
f
;
}
template <int NUM>
__inline__ __device__ void cgBlockReduceSumElements(float* element_list, float* cgBlockReduceSumElements_shm)
{
    cg::thread_block cta = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);

    int const tid = cta.thread_rank();
    int const blockz = blockDim.x;
    for (int i = 0; i < NUM; i++)
    {
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
        cgBlockReduceSumElements_shm[i * blockz + tid] = cg::reduce(tile, element_list[i], cg::plus<float>());
#else
        // TODO Add implementation here
        if (threadIdx.x == 0 && blockIdx.x == 0)
        {
            printf("[ERROR] Not support cgBlockReduceSumElements when CUDA < 11 \n");
            assert(false);
        }
#endif
    }
    cg::sync(cta);

    if (tid == 0)
    {
#pragma unroll
        for (int i = 0; i < NUM; i++)
        {
            float beta = 0.0f;
            for (int j = 0; j < blockz; j += 32)
            {
                beta += cgBlockReduceSumElements_shm[i * blockz + j];
            }
            element_list[i] = beta;
        }
    }
}
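A hedged usage sketch (not from the deleted file): the scratch buffer must hold NUM * blockDim.x floats, which is conveniently supplied as dynamic shared memory at launch time; only thread 0 receives the final sums.

// Illustrative sketch only.
__global__ void cgSumKernel(float const* in, float* out, int cols)
{
    extern __shared__ float shm[]; // NUM * blockDim.x floats
    float vals[2];
    float const v = (threadIdx.x < cols) ? in[blockIdx.x * cols + threadIdx.x] : 0.f;
    vals[0] = v;
    vals[1] = v * v;
    cgBlockReduceSumElements<2>(vals, shm); // thread 0 ends up with the block-wide sums
    if (threadIdx.x == 0)
    {
        out[2 * blockIdx.x + 0] = vals[0];
        out[2 * blockIdx.x + 1] = vals[1];
    }
}
// Possible launch: cgSumKernel<<<rows, 256, 2 * 256 * sizeof(float)>>>(d_in, d_out, cols);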
template <typename T, int MAX_K>
struct TopK
{
    int p[MAX_K]; // index, being -1 at the tail if the array is not full
    T u[MAX_K];   // value in descend order, being -MAX_T_VAL if the element is invalid

    __device__ __forceinline__ void insert(T const elem, int const elem_id)
    {
        if (elem_id < 0)
        {
            return;
        }
        // Condition of updating the array
        // 1. array is not full
        // 2. elem is greater than the smallest (last) element in the array
        // 3. elem is equal to the smallest (last) element in the array but its elem_id is smaller
        bool const need_update
            = (p[MAX_K - 1] == -1 || elem > u[MAX_K - 1] || elem == u[MAX_K - 1] && elem_id < p[MAX_K - 1]);
        if (!need_update)
        {
            return;
        }
        // Find suitable index for the new element
        int i;
        for (i = MAX_K - 2; i >= 0; --i)
        {
            bool const need_decrease = (p[i] == -1 || elem > u[i] || elem == u[i] && elem_id < p[i]);
            if (!need_decrease)
                break;
        }
        // Move elements to correct positions
        for (int k = MAX_K - 2; k >= i; --k)
        {
            p[k + 1] = p[k];
            u[k + 1] = u[k];
        }
        p[i] = elem_id;
        u[i] = elem;
    }

    __device__ __forceinline__ void init()
    {
        T const MAX_T_VAL = (std::is_same<T, half>::value) ? HALF_FLT_MAX : FLT_MAX;
        for (int i = 0; i < MAX_K; i++)
        {
            p[i] = -1;
            u[i] = -MAX_T_VAL;
        }
    }
};
template <typename T, int MAX_K>
__device__ __forceinline__ TopK<T, MAX_K> reduce_topk_op(TopK<T, MAX_K> const& a, TopK<T, MAX_K> const& b)
{
    TopK<T, MAX_K> res = a;
    for (int i = 0; i < MAX_K; ++i)
        res.insert(b.u[i], b.p[i]);
    return res;
}
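The usual consumption pattern pairs a per-thread TopK with a block-wide reduction; the sketch below is an illustration only (the kernel name, cub usage, and buffer layout are assumptions, not code from the deleted header).

// Illustrative sketch only; assumes cub is available and the helpers above are in scope.
#include <cub/cub.cuh>

template <int BLOCK, int K>
__global__ void topKPerRowKernel(float const* logits, int* out_ids, float* out_vals, int cols)
{
    typedef cub::BlockReduce<TopK<float, K>, BLOCK> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    TopK<float, K> partial;
    partial.init();
    for (int c = threadIdx.x; c < cols; c += BLOCK)
        partial.insert(logits[blockIdx.x * cols + c], c);

    // Merge the per-thread candidates; the aggregate is valid on thread 0 only.
    TopK<float, K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<float, K>);
    if (threadIdx.x == 0)
    {
        for (int k = 0; k < K; ++k)
        {
            out_ids[blockIdx.x * K + k] = total.p[k];
            out_vals[blockIdx.x * K + k] = total.u[k];
        }
    }
}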
template <typename T>
struct TopK_2
{
    int p = -1;
    T u = -((std::is_same<T, half>::value) ? HALF_FLT_MAX : FLT_MAX);

    __device__ __forceinline__ void insert(T elem, int elem_id)
    {
        if (elem > u)
        {
            u = elem;
            p = elem_id;
        }
    }

    __device__ __forceinline__ void init()
    {
        u = -((std::is_same<T, half>::value) ? HALF_FLT_MAX : FLT_MAX);
        p = -1;
    }
};
template <typename T>
__device__ __forceinline__ TopK_2<T> reduce_topk_op_2(TopK_2<T> const& a, TopK_2<T> const& b)
{
    return a.u > b.u ? a : b;
}
template <typename T>
__device__ __forceinline__ T clamp_inf_for_half(float const input)
{
    return input;
}

template <>
__device__ __forceinline__ half clamp_inf_for_half(float const input)
{
    // clamp inf values to enable fp16 training
    return input > 0.0f ? (half) min(input, HALF_FLT_MAX - 1000) : (half) max(input, -HALF_FLT_MAX + 1000);
}

} // namespace common
} // namespace tensorrt_llm
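As a small illustration (not from the deleted header), clamp_inf_for_half lets a float accumulator be stored into either float or half buffers through one template, clamping only in the half case:

// Illustrative sketch only.
template <typename T>
__device__ void storeScaled(T* dst, int idx, float acc, float scale)
{
    // For T = half this clamps to +/-(HALF_FLT_MAX - 1000); for T = float it is a pass-through.
    dst[idx] = clamp_inf_for_half<T>(acc * scale);
}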
sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.cpp
deleted
100644 → 0
View file @
9829e77e
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h"
#include <cerrno>
#include <cstdarg>
#include <cstring>
#include <iostream>
#include <string>
namespace tensorrt_llm::common
{

namespace
{
std::string vformat(char const* fmt, va_list args)
{
    va_list args0;
    va_copy(args0, args);
    auto const size = vsnprintf(nullptr, 0, fmt, args0);
    if (size <= 0)
        return "";

    std::string stringBuf(size, char{});
    auto const size2 = std::vsnprintf(&stringBuf[0], size + 1, fmt, args);
    TLLM_CHECK_WITH_INFO(size2 == size, std::string(std::strerror(errno)));

    return stringBuf;
}
} // namespace
std::string fmtstr(char const* format, ...)
{
    va_list args;
    va_start(args, format);
    std::string result = vformat(format, args);
    va_end(args);
    return result;
};
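A brief usage illustration (not part of the deleted file); the wrapper function name here is made up:

// Illustrative sketch only.
#include <string>

std::string describeRank(int world_size, int rank)
{
    // printf-style formatting into a std::string; argument types are checked
    // at compile time on non-MSVC builds thanks to the format attribute.
    return tensorrt_llm::common::fmtstr("world size %d, rank %d", world_size, rank);
}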
std::unordered_set<std::string> str2set(std::string const& input, char delimiter)
{
    std::unordered_set<std::string> values;
    if (!input.empty())
    {
        std::stringstream valStream(input);
        std::string val;
        while (std::getline(valStream, val, delimiter))
        {
            if (!val.empty())
            {
                values.insert(val);
            }
        }
    }
    return values;
};

} // namespace tensorrt_llm::common
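A brief usage illustration (not part of the deleted file); the option name and helper are hypothetical:

// Illustrative sketch only.
#include <string>
#include <unordered_set>

std::unordered_set<std::string> parseDisabledKernels(std::string const& csv)
{
    // "gemm,moe,attention" -> {"gemm", "moe", "attention"}; empty entries are dropped.
    return tensorrt_llm::common::str2set(csv, ',');
}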
sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
deleted
100644 → 0
View file @
9829e77e
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif // ENABLE_BF16
#include <cuda_fp16.h>
#include <memory> // std::make_unique
#include <sstream> // std::stringstream
#include <string>
#include <unordered_set>
#include <vector>
namespace tensorrt_llm::common
{

#if ENABLE_BF16
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
{
    stream << __bfloat162float(val);
    return stream;
}
#endif // ENABLE_BF16

static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __half const& val)
{
    stream << __half2float(val);
    return stream;
}
inline std::string fmtstr(std::string const& s)
{
    return s;
}

inline std::string fmtstr(std::string&& s)
{
    return s;
}

#if defined(_MSC_VER)
std::string fmtstr(char const* format, ...);
#else
std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
#endif

// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows
// The alternative is __FUNCSIG__, which is similar but not identical
#if defined(_WIN32)
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif

auto constexpr kDefaultDelimiter = ", ";
template <typename U, typename TStream, typename T>
inline TStream& arr2outCasted(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
{
    out << "(";
    if (size > 0)
    {
        for (size_t i = 0; i < size - 1; ++i)
        {
            out << static_cast<U>(arr[i]) << delim;
        }
        out << static_cast<U>(arr[size - 1]);
    }
    out << ")";
    return out;
}

template <typename TStream, typename T>
inline TStream& arr2out(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
{
    return arr2outCasted<T>(out, arr, size, delim);
}
template <typename T>
inline std::string arr2str(T* arr, size_t size, char const* delim = kDefaultDelimiter)
{
    std::stringstream ss;
    return arr2out(ss, arr, size, delim).str();
}

template <typename T>
inline std::string vec2str(std::vector<T> const& vec, char const* delim = kDefaultDelimiter)
{
    return arr2str(vec.data(), vec.size(), delim);
}
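A brief usage illustration (not part of the deleted header); the helper name is made up:

// Illustrative sketch only.
#include <string>
#include <vector>

std::string shapeToString(std::vector<int> const& shape)
{
    // e.g. {2, 128, 4096} -> "(2, 128, 4096)"
    return tensorrt_llm::common::vec2str(shape);
}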
inline bool strStartsWith(std::string const& str, std::string const& prefix)
{
    return str.rfind(prefix, 0) == 0;
}

/// @brief Split a string into a set of strings using a delimiter
std::unordered_set<std::string> str2set(std::string const& input, char delimiter);

} // namespace tensorrt_llm::common