OpenDAS / Lmdeploy · Commits

Commit 14ad512a, authored Jul 01, 2024 by gaoqiong
Add AWQ multi-GPU support (增加awq 多卡支持)
Parent: 6ba90df9

Showing 20 changed files with 1140 additions and 157 deletions (+1140, −157)
3rdparty/gpufusion/nccl.h                                      +805  −0
README.md                                                      +41   −16
lmdeploy/cli/cli.py                                            +5    −0
lmdeploy/turbomind/deploy/converter.py                         +6    −1
lmdeploy/turbomind/deploy/target_model/base.py                 +3    −4
lmdeploy/turbomind/deploy/target_model/w4.py                   +12   −76
lmdeploy/turbomind/turbomind.py                                +34   −7
lmdeploy/version.py                                            +1    −1
src/turbomind/kernels/gemm_s_f16/format.cu                     +64   −3
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h                 +5    −0
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc          +101  −32
src/turbomind/models/llama/LlamaDecoderLayerWeight.h           +2    −0
src/turbomind/models/llama/LlamaDenseWeight.h                  +1    −0
src/turbomind/models/llama/LlamaLinear.h                       +15   −16
src/turbomind/models/llama/LlamaWeight.cc                      +18   −1
src/turbomind/models/llama/LlamaWeight.h                       +2    −0
src/turbomind/python/bind.cpp                                  +5    −0
src/turbomind/triton_backend/llama/LlamaTritonModel.cc         +17   −0
src/turbomind/triton_backend/llama/LlamaTritonModel.h          +2    −0
src/turbomind/triton_backend/transformer_triton_backend.hpp    +1    −0
3rdparty/gpufusion/nccl.h  (new file, 0 → 100644)
/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 21803
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>

/*! @brief Opaque handle to communicator
    @details A communicator contains information required to facilitate collective communications calls */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
/*! @brief Opaque unique id used to initialize communicators
    @details The ncclUniqueId must be passed to all participating ranks */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/ } ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
    @details The various result codes that RCCL API calls may return
    @{ */
/*! @brief Result type
    @details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum {
    ncclSuccess            = 0, /*!< No error */
    ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
    ncclSystemError        = 2, /*!< Unhandled system error */
    ncclInternalError      = 3, /*!< Internal Error - Please report to RCCL developers */
    ncclInvalidArgument    = 4, /*!< Invalid argument */
    ncclInvalidUsage       = 5, /*!< Invalid usage */
    ncclRemoteError        = 6, /*!< Remote process exited or there was a network error */
    ncclInProgress         = 7, /*!< RCCL operation in progress */
    ncclNumResults         = 8  /*!< Number of result types */
} ncclResult_t;
/*! @} */

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
/*! @defgroup rccl_config_type Communicator Configuration
    @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig
    @{ */
/*! @brief Communicator configuration
    @details Users can assign value to attributes to specify the behavior of a communicator */
typedef struct ncclConfig_v21700 {
    /* attributes that users should never touch. */
    size_t       size;           /*!< Should not be touched */
    unsigned int magic;          /*!< Should not be touched */
    unsigned int version;        /*!< Should not be touched */
    /* attributes that users are able to customize. */
    int          blocking;       /*!< Whether or not calls should block or not */
    int          cgaClusterSize; /*!< Cooperative group array cluster size */
    int          minCTAs;        /*!< Minimum number of cooperative thread arrays (blocks) */
    int          maxCTAs;        /*!< Maximum number of cooperative thread arrays (blocks) */
    const char*  netName;        /*!< Force NCCL to use a specfic network */
    int          splitShare;     /*!< Allow communicators to share resources */
} ncclConfig_t;

/* Config initializer must be assigned to initialize config structure when it is created.
 * Not initialized config will result in an error. */
#define NCCL_CONFIG_INITIALIZER {                                          \
    sizeof(ncclConfig_t),                             /* size */           \
    0xcafebeef,                                       /* magic */          \
    NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */        \
    NCCL_CONFIG_UNDEF_INT,                            /* blocking */       \
    NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */ \
    NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */        \
    NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */        \
    NCCL_CONFIG_UNDEF_PTR,                            /* netName */        \
    NCCL_CONFIG_UNDEF_INT                             /* splitShare */     \
}
/*! @} */
/*! @defgroup rccl_api_version Version Information
    @details API call that returns RCCL version
    @{ */
/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
    @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] version Pointer to where version will be stored */
ncclResult_t ncclGetVersion(int* version);
/*! @cond include_hidden */
ncclResult_t pncclGetVersion(int* version);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
    @details API calls that operate on communicators.
             Communicators objects are used to launch collective communication
             operations. Unique ranks between 0 and N-1 must be assigned to
             each HIP device participating in the same Communicator.
             Using the same HIP device for multiple ranks of the same Communicator
             is not supported at this time.
    @{ */
/*! @brief Generates an ID for ncclCommInitRank.
    @details Generates an ID to be used in ncclCommInitRank.
             ncclGetUniqueId should be called once by a single rank and the
             ID should be distributed to all ranks in the communicator before
             using it as a parameter for ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] uniqueId Pointer to where uniqueId will be stored */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @cond include_hidden */
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @endcond */

/*! @brief Create a new communicator with config.
    @details Create a new communicator (multi thread/process version) with a configuration
             set by users. See @ref rccl_config_type for more details.
             Each rank is associated to a CUDA device, which has to be set before calling
             ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm   Pointer to created communicator
    @param[in]  nranks Total number of ranks participating in this communicator
    @param[in]  commId UniqueId required for initialization
    @param[in]  rank   Current rank to create communicator for. [0 to nranks-1]
    @param[in]  config Pointer to communicator configuration */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank,
                                    ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank,
                                     ncclConfig_t* config);
/*! @endcond */

/*! @brief Creates a new communicator (multi thread/process version).
    @details Rank must be between 0 and nranks-1 and unique within a communicator clique.
             Each rank is associated to a CUDA device, which has to be set before calling
             ncclCommInitRank. ncclCommInitRank implicitly syncronizes with other ranks,
             so it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm   Pointer to created communicator
    @param[in]  nranks Total number of ranks participating in this communicator
    @param[in]  commId UniqueId required for initialization
    @param[in]  rank   Current rank to create communicator for */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @endcond */
/*! @brief Creates a clique of communicators (single process version).
    @details This is a convenience function to create a single-process communicator clique.
             Returns an array of ndev newly initialized communicators in comm.
             comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
             If devlist is NULL, the first ndev HIP devices are used.
             Order of devlist defines user-order of processors within the communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm    Pointer to array of created communicators
    @param[in]  ndev    Total number of ranks participating in this communicator
    @param[in]  devlist Array of GPU device indices to create for */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @cond include_hidden */
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @endcond */

/*! @brief Finalize a communicator.
    @details ncclCommFinalize flushes all issued communications
             and marks communicator state as ncclInProgress. The state will change to ncclSuccess
             when the communicator is globally quiescent and related resources are freed; then,
             calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
             itself) without blocking.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to finalize */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommFinalize(ncclComm_t comm);
/*! @endcond */

/*! @brief Frees local resources associated with communicator object.
    @details Destroy all local resources associated with the passed in communicator object
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to destroy */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/*! @endcond */

/*! @brief Abort any in-progress calls and destroy the communicator object.
    @details Frees resources associated with communicator object and aborts any operations
             that might still be running on the device.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to abort and destroy */
ncclResult_t ncclCommAbort(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommAbort(ncclComm_t comm);
/*! @endcond */

/*! @brief Create one or more communicators from an existing one.
    @details Creates one or more communicators from an existing one.
             Ranks with the same color will end up in the same communicator.
             Within the new communicator, key will be used to order ranks.
             NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
             and will therefore return a NULL communicator.
             If config is NULL, the new communicator will inherit the original communicator's configuration
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm    Original communicator object for this rank
    @param[in]  color   Color to assign this rank
    @param[in]  key     Key used to order ranks within the same new communicator
    @param[out] newcomm Pointer to new communicator
    @param[in]  config  Config file for new communicator. May be NULL to inherit from comm */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_errcheck Error Checking Calls
    @details API calls that check for errors
    @{ */
/*! @brief Returns a string for each result code.
    @details Returns a human-readable string describing the given result code.
    @return String containing description of result code.
    @param[in] result Result code to get description for */
const char* ncclGetErrorString(ncclResult_t result);
/*! @cond include_hidden */
const char* pncclGetErrorString(ncclResult_t result);
/*! @endcond */

/*! @brief Returns mesage on last result that occured.
    @details Returns a human-readable message of the last error that occurred.
    @return String containing the last result
    @param[in] comm is currently unused and can be set to NULL */
const char* ncclGetLastError(ncclComm_t comm);
/*! @cond include_hidden */
const char* pncclGetLastError(ncclComm_t comm);
/*! @endcond */

/*! @brief Checks whether the comm has encountered any asynchronous errors
    @details Query whether the provided communicator has encountered any asynchronous errors
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm       Communicator to query
    @param[out] asyncError Pointer to where result code will be stored */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/*! @cond include_hidden */
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_comminfo Communicator Information
    @details API calls that query communicator information
    @{ */
/*! @brief Gets the number of ranks in the communicator clique.
    @details Returns the number of ranks in the communicator clique (as set during initialization)
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm  Communicator to query
    @param[out] count Pointer to where number of ranks will be stored */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
/*! @cond include_hidden */
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/*! @endcond */

/*! @brief Get the ROCm device index associated with a communicator
    @details Returns the ROCm device number associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm   Communicator to query
    @param[out] device Pointer to where the associated ROCm device index will be stored */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @cond include_hidden */
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @endcond */

/*! @brief Get the rank associated with a communicator
    @details Returns the user-ordered "rank" associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm Communicator to query
    @param[out] rank Pointer to where the associated rank will be stored */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @cond include_hidden */
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_enumerations API Enumerations
    @details Enumerations used by collective communication calls
    @{ */
/*! @brief Dummy reduction enumeration
    @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;

/*! @brief Reduction operation selector
    @details Enumeration used to specify the various reduction operations
             ncclNumOps is the number of built-in ncclRedOp_t values and serves as
             the least possible value for dynamic ncclRedOp_t values constructed by
             ncclRedOpCreate functions.
             ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
             to be the largest signed value (since compilers are permitted to use
             signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
             RCCL versions to maintain ABI compatibility. */
typedef enum {
    ncclSum      = 0, /*!< Sum */
    ncclProd     = 1, /*!< Product */
    ncclMax      = 2, /*!< Max */
    ncclMin      = 3, /*!< Min */
    ncclAvg      = 4, /*!< Average */
    ncclNumOps   = 5, /*!< Number of built-in reduction ops */
    ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
} ncclRedOp_t;
/*! @brief Data types
    @details Enumeration of the various supported datatype */
typedef enum {
    ncclInt8     = 0,
    ncclChar     = 0,
    ncclUint8    = 1,
    ncclInt32    = 2,
    ncclInt      = 2,
    ncclUint32   = 3,
    ncclInt64    = 4,
    ncclUint64   = 5,
    ncclFloat16  = 6,
    ncclHalf     = 6,
    ncclFloat32  = 7,
    ncclFloat    = 7,
    ncclFloat64  = 8,
    ncclDouble   = 8,
    ncclBfloat16 = 9,
    ncclNumTypes = 10
} ncclDataType_t;
/*! @} */
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
    @details API calls relating to creation/destroying custom reduction operator
             that pre-multiplies local source arrays prior to reduction
    @{ */
/*! @brief Location and dereferencing logic for scalar arguments.
    @details Enumeration specifying memory location of the scalar argument.
             Based on where the value is stored, the argument will be dereferenced either
             while the collective is running (if in device memory), or before the ncclRedOpCreate()
             function returns (if in host memory). */
typedef enum {
    ncclScalarDevice        = 0, /*!< Scalar is in device-visible memory */
    ncclScalarHostImmediate = 1  /*!< Scalar is in host-visible memory */
} ncclScalarResidence_t;

/*! @brief Create a custom pre-multiplier reduction operator
    @details Creates a new reduction operator which pre-multiplies input values by a given
             scalar locally before reducing them with peer values via summation. For use
             only with collectives launched against *comm* and *datatype*. The
             *residence* argument indicates how/when the memory pointed to by *scalar*
             will be dereferenced. Upon return, the newly created operator's handle
             is stored in *op*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] op        Pointer to where newly created custom reduction operator is to be stored
    @param[in]  scalar    Pointer to scalar value.
    @param[in]  datatype  Scalar value datatype
    @param[in]  residence Memory type of the scalar value
    @param[in]  comm      Communicator to associate with this custom reduction operator */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
                                      ncclScalarResidence_t residence, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
                                       ncclScalarResidence_t residence, ncclComm_t comm);
/*! @endcond */

/*! @brief Destroy custom reduction operator
    @details Destroys the reduction operator *op*. The operator must have been created by
             ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
             destroyed as soon as the last RCCL function which is given that operator returns.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] op   Custom reduction operator is to be destroyed
    @param[in] comm Communicator associated with this reduction operator */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_collective_api Collective Communication Operations
    @details Collective communication operations must be called separately for each
             communicator in a communicator clique.
             They return when operations have been enqueued on the HIP stream.
             Since they may perform inter-CPU synchronization, each call has to be done
             from a different thread or process, or need to use Group Semantics (see
             below).
    @{ */
/*! @brief Reduce
    @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
             operation.
             *recvbuff* may be NULL on all calls except for root device.
             *root* is the rank (not the HIP device) where data will reside after the
             operation is complete.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Local device data buffer to be reduced
    @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks.
    @param[in]  count    Number of elements in every send buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  op       Reduction operator type
    @param[in]  root     Rank where result data array will be stored
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                        ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                         ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief (Deprecated) Broadcast (in-place)
    @details Copies *count* values from *root* to all other devices.
             root is the rank (not the CUDA device) where data resides before the
             operation is started.
             This operation is implicitly in-place.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in,out] buff     Input array on *root* to be copied to other ranks. Output array for all ranks.
    @param[in]     count    Number of elements in data buffer
    @param[in]     datatype Data buffer element datatype
    @param[in]     root     Rank owning buffer to be copied to others
    @param[in]     comm     Communicator group object to execute on
    @param[in]     stream   HIP stream to execute collective on */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
                       ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
                        ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Broadcast
    @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
             *root* is the rank (not the HIP device) where data resides before the operation is started.
             *sendbuff* may be NULL on ranks other than *root*.
             In-place operation will happen if *sendbuff* == *recvbuff*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Data array to copy (if *root*). May be NULL for other ranks
    @param[in]  recvbuff Data array to store received array
    @param[in]  count    Number of elements in data buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  root     Rank of broadcast root
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                            int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-Reduce
    @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
             leaves identical copies of result on each *recvbuff*.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Input data array to reduce
    @param[out] recvbuff Data array to store reduced result array
    @param[in]  count    Number of elements in data buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  op       Reduction operator
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                            ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Reduce-Scatter
    @details Reduces data in *sendbuff* using *op* operation and leaves reduced result
             scattered over the devices so that *recvbuff* on rank i will contain the i-th
             block of the result.
             Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
             should have a size of at least nranks*recvcount elements.
             In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Input data array to reduce
    @param[out] recvbuff  Data array to store reduced result subarray
    @param[in]  recvcount Number of elements each rank receives
    @param[in]  datatype  Data buffer element datatype
    @param[in]  op        Reduction operator
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                               ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                                ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-Gather
    @details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
             receiving data from rank i at offset i*sendcount.
             Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
             should have a size of at least nranks*sendcount elements.
             In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Input data array to send
    @param[out] recvbuff  Data array to store the gathered result
    @param[in]  sendcount Number of elements each rank sends
    @param[in]  datatype  Data buffer element datatype
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                           ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                            ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Send
    @details Send data from *sendbuff* to rank *peer*.
             Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
             as this rank.
             This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
             need to progress concurrently to complete, they must be fused within a ncclGroupStart /
             ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] sendbuff Data array to send
    @param[in] count    Number of elements to send
    @param[in] datatype Data buffer element datatype
    @param[in] peer     Peer rank to send to
    @param[in] comm     Communicator group object to execute on
    @param[in] stream   HIP stream to execute collective on */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
                      ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
                       ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief Receive
    @details Receive data from rank *peer* into *recvbuff*.
             Rank *peer* needs to call ncclSend with the same datatype and the same count
             as this rank.
             This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
             need to progress concurrently to complete, they must be fused within a ncclGroupStart/
             ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] recvbuff Data array to receive
    @param[in]  count    Number of elements to receive
    @param[in]  datatype Data buffer element datatype
    @param[in]  peer     Peer rank to send to
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                      ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                       ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Gather
    @details Root device gathers *sendcount* values from other GPUs into *recvbuff*,
             receiving data from rank i at offset i*sendcount.
             Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
             should have a size of at least nranks*sendcount elements.
             In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
             *recvbuff* may be NULL on ranks other than *root*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Data array to send
    @param[out] recvbuff  Data array to receive into on *root*.
    @param[in]  sendcount Number of elements to send per rank
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that receives data from all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                        int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief Scatter
    @details Scattered over the devices so that recvbuff on rank i will contain the i-th
             block of the data on root.
             Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
             should have a size of at least nranks*recvcount elements.
             In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Data array to send (on *root* rank). May be NULL on other ranks.
    @param[out] recvbuff  Data array to receive partial subarray into
    @param[in]  recvcount Number of elements to receive per rank
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that scatters data to all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                          int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief All-To-All
    @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
             block. Each block for sending/receiving has *count* elements, which means
             that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
             In-place operation is NOT supported. It is the user's responsibility
             to ensure that sendbuff and recvbuff are distinct.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Data array to send (contains blocks for each other rank)
    @param[out] recvbuff Data array to receive (contains blocks from each other rank)
    @param[in]  count    Number of elements to send between each pair of ranks
    @param[in]  datatype Data buffer element datatype
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                          ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-To-Allv
    @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
             to device (j). At the same time, device (i) receives recvcounts[j] of data
             from device (j) to be placed at rdispls[j].
             sendcounts, sdispls, recvcounts and rdispls are all measured in the units
             of datatype, not bytes.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff   Data array to send (contains blocks for each other rank)
    @param[in]  sendcounts Array containing number of elements to send to each participating rank
    @param[in]  sdispls    Array of offsets into *sendbuff* for each participating rank
    @param[out] recvbuff   Data array to receive (contains blocks from each other rank)
    @param[in]  recvcounts Array containing number of elements to receive from each participating rank
    @param[in]  rdispls    Array of offsets into *recvbuff* for each participating rank
    @param[in]  datatype   Data buffer element datatype
    @param[in]  comm       Communicator group object to execute on
    @param[in]  stream     HIP stream to execute collective on */
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
                           void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
                           ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
                            void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
                            ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @} */
/*! @defgroup msccl_api MSCCL Algorithm
    @details API calls relating to the optional MSCCL algorithm datapath
    @{ */
/*! @brief Opaque handle to MSCCL algorithm */
typedef int mscclAlgoHandle_t;

/*! @brief MSCCL Load Algorithm
    @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
             its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
             scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  mscclAlgoFilePath Path to MSCCL algorithm file
    @param[out] mscclAlgoHandle   Returned handle to MSCCL algorithm
    @param[in]  rank              Current rank */
ncclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
/*! @cond include_hidden */
ncclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
/*! @endcond */

/*! @brief MSCCL Run Algorithm
    @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
             list merges all possible parameters required by different operations as this
             is a general-purposed API. This API is expected to be called by MSCCL
             scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendBuff        Data array to send
    @param[in]  sendCounts      Array containing number of elements to send to each participating rank
    @param[in]  sDisPls         Array of offsets into *sendbuff* for each participating rank
    @param[out] recvBuff        Data array to receive
    @param[in]  recvCounts      Array containing number of elements to receive from each participating rank
    @param[in]  rDisPls         Array of offsets into *recvbuff* for each participating rank
    @param[in]  count           Number of elements
    @param[in]  dataType        Data buffer element datatype
    @param[in]  root            Root rank index
    @param[in]  peer            Peer rank index
    @param[in]  op              Reduction operator
    @param[in]  mscclAlgoHandle Handle to MSCCL algorithm
    @param[in]  comm            Communicator group object to execute on
    @param[in]  stream          HIP stream to execute collective on */
ncclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
                          void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
                          size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
                          mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
                           void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
                           size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
                           mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief MSCCL Unload Algorithm
    @details Unload MSCCL algorithm previous loaded using its handle. This API
             is expected to be called by MSCCL scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload */
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @cond include_hidden */
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_group_api Group semantics
    @details When managing multiple GPUs from a single thread, and since RCCL collective
             calls may perform inter-CPU synchronization, we need to "group" calls for
             different ranks/devices into a single call.
             Grouping RCCL calls as being part of the same collective operation is done
             using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
             collective calls until the ncclGroupEnd call, which will wait for all calls
             to be complete. Note that for collective communication, ncclGroupEnd only
             guarantees that the operations are enqueued on the streams, not that
             the operation is effectively done.
             Both collective communication and ncclCommInitRank can be used in conjunction
             of ncclGroupStart/ncclGroupEnd, but not together.
             Group semantics also allow to fuse multiple operations on the same device
             to improve performance (for aggregated collective calls), or to permit
             concurrent progress of multiple send/receive operations.
    @{ */
/*! @brief Group Start
    @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
             a single RCCL operation. Nothing will be started on the HIP stream until
             ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupStart();
/*! @cond include_hidden */
ncclResult_t pncclGroupStart();
/*! @endcond */

/*! @brief Group End
    @details End a group call. Start a fused RCCL operation consisting of all calls since
             ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
             need to be called after ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupEnd();
/*! @cond include_hidden */
ncclResult_t pncclGroupEnd();
/*! @endcond */
/*! @} */

#ifdef __cplusplus
}  // end extern "C"
#endif

#endif  // end include guard
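The group-semantics block at the end of this header is the part the multi-GPU AWQ path leans on when several ranks/devices are driven from one process: send/recv pairs have to be fused between ncclGroupStart and ncclGroupEnd so they can progress concurrently. As a rough framework-level analogue (not the raw RCCL C API), PyTorch's NCCL/RCCL backend exposes the same pattern through batched point-to-point operations. A minimal sketch, assuming torch.distributed has already been initialized with the nccl backend and world_size >= 2:

```python
import torch
import torch.distributed as dist

def ring_exchange(tensor: torch.Tensor) -> torch.Tensor:
    """Send `tensor` to the next rank and receive one from the previous rank.

    The isend/irecv pair is batched so both operations can progress
    concurrently, mirroring the ncclGroupStart/ncclGroupEnd fusion above.
    """
    rank, world = dist.get_rank(), dist.get_world_size()
    recv = torch.empty_like(tensor)          # must live on this rank's GPU for the nccl backend
    ops = [
        dist.P2POp(dist.isend, tensor, (rank + 1) % world),
        dist.P2POp(dist.irecv, recv, (rank - 1) % world),
    ]
    for req in dist.batch_isend_irecv(ops):  # grouped dispatch, then wait
        req.wait()
    return recv
```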
README.md  (+41 −16)
@@ -67,7 +67,9 @@ yum install rapidjson
 export NCCL_LIB_DIR=/opt/dtk/cuda/lib64
 pip3 install -r requirements.txt
 pip3 install urllib3==1.24
-# switch the apt-get sources and add the Tsinghua mirror
+apt-get install rapidjson-dev
+# if the install fails, switch the apt-get sources and add the Tsinghua mirror
+# update after adding the mirror
 #vim /etc/apt/sources.list
 # add the Tsinghua mirror entries as follows:
@@ -75,17 +77,17 @@ pip3 install urllib3==1.24
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
-# update after switching the sources
-sudo apt-get update
-apt-get install rapidjson-dev
+# after switching the sources, update and then re-run the install
+#sudo apt-get update
 # set the nccl environment variable
 export NCCL_LAUNCH_MODE=GROUP
 ```
 Notes:
 1. When starting docker, the -v /opt/hyhal:/opt/hyhal mount must not be omitted.
 2. The URL given in the gpufusion wget command may change; you can open the page it points to and download the corresponding archive manually.
+3. When building inside the DTK24041 pytorch image, the dtk bundled in that image already ships gpufusion files; during the build of this project one of them must be replaced: put lmdeploy/3rdparty/gpufusion/nccl.h under /opt/dtk/cuda/include.

#### Build and install from source
- Download the code
@@ -113,13 +115,16 @@ cd dist && pip3 install lmdeploy*
 ### Model conversion
 ```bash
-# <model_name>    model name ('llama', 'internlm', 'vicuna', 'wizardlM', 'internlm-chat-7b', 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'baichuan-7b', 'baichuan2-7b', 'puyu', 'llama2', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'solar', 'ultralm', 'ultracm', 'yi')
+# <model_name>    model name (['base', 'llama', 'vicuna', 'wizardlm', 'internlm', 'internlm-chat', 'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'internlm2-1_8b', 'internlm2-7b', 'internlm2-20b', 'internlm2', 'internlm2-chat', 'internlm2-chat-20b', 'baichuan-base', 'baichuan-7b', 'baichuan2', 'baichuan2-7b', 'puyu', 'llama2', 'llama-2', 'llama-2-chat', 'qwen', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'falcon', 'chatglm', 'chatglm2-6b', 'solar', 'solar-70b', 'ultralm', 'ultracm', 'yi', 'yi-chat', 'yi-200k', 'yi-34b', 'Mistral-7B-Instruct', 'mixtral', 'gemma', 'deepseek', 'deepseek-chat', 'yi-vl'])
 # <model_path>    path to the model
-# <model_format>  model format ('llama', 'hf', or None; may be left unset, in which case it defaults to None and the code infers the format from the model, so it is usually not written)
-# <model_format>  destination path for the output (default ./workspace)
+# <model_format>  model format ('awq', 'hf', 'llama')
+# <dst_path>      destination path for the output (default ./workspace)
 # <tp>            number of GPUs used for tensor parallelism; should be 2^n
 # <quant_model_path>  the AWQ-quantized model
 # if using an fp16 model
 lmdeploy convert ${model_name} ${model_path} --model-format ${model_format} --dst-path ${dst_path} --tp ${tp}
 # if using an AWQ model
 lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 ```
 ### Run
 #### Run from the bash console
@@ -155,21 +160,41 @@ see the docs [here](docs/zh_cn/serving) for details on using the api-server
 codellama deployment can follow [codellama](docs/zh_cn/supported_models/codellama.md)
-### AWQ quantized inference
+## AWQ quantized inference
 This release supports quantized inference; the steps are as follows:
 ```bash
-#group_size: the grouping parameter used when the model was quantized, usually 128
+# quantize using a calibration dataset
+# any dataset can be used as needed; the example below quantizes with c4
+# edit the get_c4() function in lmdeploy/lmdeploy/lite/utils/calib_dataloader.py to point to a local dataset path
+lmdeploy lite auto_awq ${model_path} --calib-dataset 'c4' --calib-samples 128 --calib-seqlen 2048 --w-bits 4 --w-group-size 128 --work-dir ${quant_model_path}
+#group_size: the grouping parameter used when the model was quantized; only 128 is supported
+#<tp>       number of GPUs used for tensor parallelism; should be 2^n
+#<dst-path> destination folder for the converted model
 #step 1: convert the model
-lmdeploy convert ${model_name} ${model_path} --model_format awq --group-size ${group_size} --tp ${tp} --dst-path ${dst_path}
+lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 #step 2: run the model
 lmdeploy chat turbomind ${dst_path} --tp ${tp}
 ```
 Notes:
-1. This release temporarily supports only tp=1 single-GPU quantized inference, only on the KM-AI card type; K100/Z100/Z100L are not supported yet.
-2. Quantized inference in this release requires first converting the model to the turbomind format with convert and then running inference; quantized inference directly from the hf model is not supported yet.
-3. This release does not yet support quantization with a calibration dataset; the quantized model must be obtained elsewhere.
+1. This release supports only the KM-AI card type; K100/Z100/Z100L are not supported yet.
+2. For benchmark evaluation, AWQ models cannot be evaluated directly from the hf model; convert the quantized model to the turbomind format first, and the tp used at run time must match the tp given at conversion time.
+3. When quantizing llama2-70b and qwen-72b with a dataset, a calib-samples value of 120 is recommended.
+4. The multi-GPU support matrix is as follows:

| Model | AWQ TP=1 | AWQ TP=2 | AWQ TP=4 |
| :----------: | :------: | :--: | :--: |
| Llama2-7B-chat | Yes | Yes | No |
| Llama2-13B-chat | Yes | Yes | Yes |
| Llama2-70B-chat | Yes | Yes | Yes |
| qwen-7B-chat | Yes | Yes | No |
| qwen-14B-chat | Yes | No | No |
| qwen-72B-chat | Yes | Yes | Yes |

Remark: qwen-14b-chat does not support multi-GPU AWQ inference because it contains a GEMM of size [13696, 5120]; with group_size = 128 the scale shape is [13696/128, 5120] = [107, 5120], and 107 is not divisible by tp = 2 or 4. You can use this rule to judge whether your own model supports multi-GPU AWQ inference.
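The divisibility rule in the remark can be checked up front. A small helper applying it (hypothetical, not part of lmdeploy):

```python
def awq_tp_supported(rows: int, cols: int, tp: int, group_size: int = 128) -> bool:
    """A [rows, cols] GEMM weight splits across `tp` GPUs under AWQ only if
    the scale matrix (rows / group_size rows) divides evenly by tp."""
    if rows % group_size != 0:
        return False
    scale_rows = rows // group_size
    return scale_rows % tp == 0

# qwen-14b-chat example from the remark: [13696, 5120] gives 107 scale rows,
# which is not divisible by 2 or 4, so only TP=1 works for that layer.
print(awq_tp_supported(13696, 5120, tp=2))  # False
print(awq_tp_supported(13696, 5120, tp=1))  # True
```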
## result
(result image)
lmdeploy/cli/cli.py

@@ -66,6 +66,11 @@ class CLI(object):
                             type=int,
                             default=2,
                             help='A parameter used in AWQ to control the layout of weight ')
+        parser.add_argument('--w4-pad-size',
+                            type=int,
+                            default=2,
+                            help='A parameter used in AWQ to control the pad size of weight ')
         parser.set_defaults(run=CLI.convert)

     @staticmethod
lmdeploy/turbomind/deploy/converter.py

@@ -197,6 +197,7 @@ def main(model_name: str,
          quant_path: str = None,
          group_size: int = 0,
          w4_weight_layout: int = 2,
+         w4_pad_size: int = 2,
          **kwargs):
     """deploy llama family models via turbomind.

@@ -217,6 +218,7 @@ def main(model_name: str,
         group_size (int): a parameter used in AWQ to quantize fp16 weights
             to 4 bits
         w4_weight_layout (int): a parameter used in AWQ to control the layout of weight
+        w4_pad_size (int): a parameter used in AWQ to control the pad size of weight
         kwargs (dict): other params for convert
     """

@@ -263,12 +265,15 @@ def main(model_name: str,
     cfg.rotary_embedding = cfg.size_per_head
     cfg.group_size = group_size
     cfg.w4_weight_layout = w4_weight_layout
+    cfg.w4_pad_size = w4_pad_size

     if inferred_model_format.find('awq') != -1:
         cfg.weight_type = 'int4'
         output_format = 'w4'
         assert group_size > 0, f'group_size: {group_size} should > 0'
-        print("w4_weight_layout:", w4_weight_layout)
+        # print("w4_weight_layout:",w4_weight_layout)
         assert w4_weight_layout >= 0 and w4_weight_layout < 3, \
             f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
+        assert w4_pad_size >= 0 and w4_pad_size < 5, f'w4_pad_size should >= 0 and <5'
     else:
         # output_format = update_output_format(model_name, inferred_model_format,
         #                                      model_path, output_format)
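The asserts above fix the accepted ranges for the two AWQ knobs the converter now carries. A standalone restatement of that validation (illustrative only; the real checks live in converter.py as shown):

```python
def validate_w4_options(group_size: int, w4_weight_layout: int, w4_pad_size: int) -> None:
    """Mirror of the converter's checks for AWQ ('w4') output: group_size must be
    positive, the layout selector is one of {0, 1, 2}, and the pad size is 0-4."""
    assert group_size > 0, f'group_size: {group_size} should > 0'
    assert 0 <= w4_weight_layout < 3, f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
    assert 0 <= w4_pad_size < 5, f'w4_pad_size: {w4_pad_size} should >= 0 and < 5'

validate_w4_options(group_size=128, w4_weight_layout=2, w4_pad_size=2)  # the CLI defaults
```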
lmdeploy/turbomind/deploy/target_model/base.py

@@ -54,6 +54,7 @@ class TurbomindModelConfig:
     size_per_head: int = 128
     group_size: int = 0
     w4_weight_layout: int = 2
+    w4_pad_size: int = 2
     max_batch_size: int = 64
     max_context_token_num: int = 1
     step_length: int = 1

@@ -208,6 +209,7 @@ class BaseOutputModel(ABC):
                 param = param.to(torch_type)
                 tprint(name, param.shape)
                 _tofile(param, osp.join(self.out_dir, name))
             elif len(self.tm_params) > 0:
                 tm_params = self.tm_params
                 weight_type = self.cfg.weight_type

@@ -228,6 +230,7 @@ class BaseOutputModel(ABC):
                     torch_tensor = torch_tensor.float()
                 for tm_tensor in tm_params[name]:
                     tm_tensor.copy_from(torch_tensor)
                 tm_params.pop(name)
             else:
                 tprint('skip export', name, param.shape)

@@ -325,10 +328,6 @@ def permute(x: torch.Tensor, size_per_head: int = 128):
         return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1)


-def permute_trans(x: torch.Tensor):
-    if x.shape[-1] > 1:
-        dim = x.shape[-1]
-        return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim, -1)


 def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, dim: int):
lmdeploy/turbomind/deploy/target_model/w4.py

@@ -8,7 +8,7 @@ import lmdeploy
 from ..source_model.base import BaseInputModel, BaseReader
 from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
-                   merge_qkv, permute, permute_trans)
+                   merge_qkv, permute)

 # import _turbomind as _tm
 # TODO: find another way import _turbomind

@@ -117,6 +117,7 @@ class TurbomindW4Model(BaseOutputModel):
         group_size = self.cfg.group_size
         tp = self.cfg.tensor_para_size
         w4_weight_layout = self.cfg.w4_weight_layout
+        w4_pad_size = self.cfg.w4_pad_size
         size_per_head = self.cfg.size_per_head

         # attn
         q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))

@@ -134,48 +135,15 @@ class TurbomindW4Model(BaseOutputModel):
         qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
         qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)

-        pad_group_count = 2
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if qkv_qw.shape[0] % 4096 == 0:
-                qkv_qw_padding = torch.zeros(group_size * pad_group_count, qkv_qw.shape[1], dtype=torch.int32).cuda()
-                qkv_qw = torch.cat((qkv_qw, qkv_qw_padding), dim=0).contiguous()
-                qkv_qz_padding = torch.zeros(pad_group_count, qkv_qz.shape[1], dtype=torch.int32).cuda()
-                qkv_qz = torch.cat((qkv_qz, qkv_qz_padding), dim=0).contiguous()
-                qkv_s_padding = torch.zeros(pad_group_count, qkv_s.shape[1], dtype=torch.float16).cuda()
-                qkv_s = torch.cat((qkv_s, qkv_s_padding), dim=0).contiguous()
-            qkv_qw, qkv_sz = convert_s4_(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            qkv_sz = permute_trans(qkv_sz)
-        else:
-            qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            #print("请设置weight layout\n")
+        qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
         self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
         self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if o_qw.shape[0] % 4096 == 0:
-                o_qw_padding = torch.zeros(group_size * pad_group_count, o_qw.shape[1], dtype=torch.int32).cuda()
-                o_qw = torch.cat((o_qw, o_qw_padding), dim=0).contiguous()
-                o_qz_padding = torch.zeros(pad_group_count, o_qz.shape[1], dtype=torch.int32).cuda()
-                o_qz = torch.cat((o_qz, o_qz_padding), dim=0).contiguous()
-                o_s_padding = torch.zeros(pad_group_count, o_s.shape[1], dtype=torch.float16).cuda()
-                o_s = torch.cat((o_s, o_s_padding), dim=0).contiguous()
-            o_qw, o_sz = convert_s4_(o_qw, o_qz, o_s, group_size)
-            o_sz = permute_trans(o_sz)
-        else:
-            o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+        o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
         self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
         self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)

         q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
         if q_b is not None:
             q_b = permute(q_b, size_per_head)

@@ -184,6 +152,7 @@ class TurbomindW4Model(BaseOutputModel):
             self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
             self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)

         # ffn weights
         w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
         w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))

@@ -191,45 +160,12 @@ class TurbomindW4Model(BaseOutputModel):
         w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, w3_s)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if w13_qw.shape[0] % 4096 == 0:
-                w13_qw_padding = torch.zeros(group_size * pad_group_count, w13_qw.shape[1], dtype=torch.int32).cuda()
-                w13_qw = torch.cat((w13_qw, w13_qw_padding), dim=0).contiguous()
-                w13_qz_padding = torch.zeros(pad_group_count, w13_qz.shape[1], dtype=torch.int32).cuda()
-                w13_qz = torch.cat((w13_qz, w13_qz_padding), dim=0).contiguous()
-                w13_s_padding = torch.zeros(pad_group_count, w13_s.shape[1], dtype=torch.float16).cuda()
-                w13_s = torch.cat((w13_s, w13_s_padding), dim=0).contiguous()
-            w13_qw, w13_sz = convert_s4_(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
-            w13_sz = permute_trans(w13_sz)
-        else:
-            w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
+        w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
         self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
         self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros', -1)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            #pading
-            if w2_qw.shape[0] % 4096 == 0:
-                w2_qw_padding = torch.zeros(group_size * pad_group_count, w2_qw.shape[1], dtype=torch.int32).cuda()
-                w2_qw = torch.cat((w2_qw, w2_qw_padding), dim=0).contiguous()
-                w2_qz_padding = torch.zeros(pad_group_count, w2_qz.shape[1], dtype=torch.int32).cuda()
-                w2_qz = torch.cat((w2_qz, w2_qz_padding), dim=0).contiguous()
-                w2_s_padding = torch.zeros(pad_group_count, w2_s.shape[1], dtype=torch.float16).cuda()
-                w2_s = torch.cat((w2_s, w2_s_padding), dim=0).contiguous()
-            w2_qw, w2_sz = convert_s4_(w2_qw, w2_qz, w2_s, group_size)
-            w2_sz = permute_trans(w2_sz)
-        else:
-            w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+        w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
         self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
         self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
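For reference, the branches removed above implemented a simple padding rule: when a packed AWQ weight's row count is a multiple of 4096 (and the layout is 1 or 2), the qweight gains group_size * pad_group_count zero rows while the zeros and scales each gain pad_group_count rows, before conversion and scale transposition. A minimal standalone sketch of just that rule, with illustrative shapes and run on CPU tensors rather than the original .cuda() calls:

```python
import torch

def pad_awq_tensors(qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor,
                    group_size: int = 128, pad_group_count: int = 2):
    """Apply the padding rule from the removed w4.py branch."""
    if qweight.shape[0] % 4096 == 0:
        qweight = torch.cat(
            (qweight, torch.zeros(group_size * pad_group_count, qweight.shape[1], dtype=qweight.dtype)), dim=0)
        qzeros = torch.cat(
            (qzeros, torch.zeros(pad_group_count, qzeros.shape[1], dtype=qzeros.dtype)), dim=0)
        scales = torch.cat(
            (scales, torch.zeros(pad_group_count, scales.shape[1], dtype=scales.dtype)), dim=0)
    return qweight.contiguous(), qzeros.contiguous(), scales.contiguous()

# a 4096-row packed weight grows by 256 rows; its zeros/scales grow by 2 rows
qw, qz, s = pad_awq_tensors(torch.zeros(4096, 16, dtype=torch.int32),
                            torch.zeros(32, 16, dtype=torch.int32),
                            torch.zeros(32, 16, dtype=torch.float16))
print(qw.shape, qz.shape, s.shape)  # [4352, 16], [34, 16], [34, 16]
```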
lmdeploy/turbomind/turbomind.py

@@ -148,6 +148,7 @@ class TurboMind:
                  model_format: Optional[str] = None,
                  group_size: Optional[int] = None,
                  w4_weight_layout: Optional[int] = None,
+                 w4_pad_size: Optional[int] = None,
                  tp: Optional[int] = None,
                  chat_template_config: Optional[ChatTemplateConfig] = None,
                  **kwargs):

@@ -181,6 +182,7 @@ class TurboMind:
                                   model_format=model_format,
                                   group_size=group_size,
                                   w4_weight_layout=w4_weight_layout,
+                                  w4_pad_size=w4_pad_size,
                                   tp=tp,
                                   **kwargs)

@@ -237,6 +239,28 @@ class TurboMind:
             threads.append(t)
         for t in threads:
             t.join()

+    def _modify_weight(self, model_comm):
+        """modify weight if from_hf with awq."""
+        # TODO: support mpi
+        self.node_id = 0
+        self.node_num = 1
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
+        torch.cuda.synchronize()
+
+        def _modify_weight_func(device_id):
+            with cuda_ctx(device_id):
+                rank = self.node_id * self.gpu_count + device_id
+                model_comm.modify_shared_weights(device_id, rank)
+
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_modify_weight_func, args=(device_id, ))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()

     def _load_kv_qparams(self, model_path, tm_params, **kwargs):
         """Load kv qparams when loading from hf."""

@@ -271,10 +295,10 @@ class TurboMind:
             t.join()

         for _ in range(self.gpu_count):
             tensor_map = que.get()
             for k, v in tensor_map.items():
                 if k not in tm_params:
                     tm_params[k] = []
                 tm_params[k].append(v)

     def _from_hf(self, model_source: ModelSource, model_path: str,

@@ -307,10 +331,11 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
             cfg.w4_weight_layout = 2
+            cfg.w4_pad_size = 0
         else:
-            # output_format = update_output_format(cfg.model_name,
-            #                                      inferred_model_format,
-            #                                      model_path, output_format)
+            output_format = update_output_format(cfg.model_name,
+                                                 inferred_model_format,
+                                                 model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)

@@ -342,12 +367,15 @@ class TurboMind:
         # copy hf model weight to turbomind weight
         tm_params = output_model.tm_params
         self._get_model_params(model_comm, tm_params)
         logger.warning(f'get {len(tm_params)} model params')
         output_model.export()
+        self._modify_weight(model_comm)
         # load kv qparams
         self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
         assert len(tm_params) == 0, f'missing {tm_params.keys()}'
         return model_comm

@@ -381,7 +409,6 @@ class TurboMind:
         self.config = cfg
         self.model_name = cfg.model_name
         self.data_type = cfg.weight_type
-        #print("from_workspace_cfg:",cfg)
         # create model
         logger.warning(f'model_config:\n\n{cfg.toini()}')
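The new _modify_weight step added above reuses the pattern this loader already follows elsewhere: start one worker thread per visible GPU, derive the rank from node_id and device_id, and join every thread before continuing. A stripped-down sketch of that dispatch pattern with a generic callback (not the actual turbomind model_comm API):

```python
from threading import Thread

def run_per_device(gpu_count: int, node_id: int, fn) -> None:
    """Call fn(device_id, rank) on one thread per local GPU and wait for all of them."""
    threads = []
    for device_id in range(gpu_count):
        rank = node_id * gpu_count + device_id
        t = Thread(target=fn, args=(device_id, rank))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

# usage sketch:
run_per_device(2, 0, lambda dev, rank: print(f'device {dev} -> rank {rank}'))
```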
lmdeploy/version.py

 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple

-__dcu_version__ = '0.2.6'
+__dcu_version__ = '0.2.6+das1.1.git7063377.abi0.dtk2404.torch2.1.0'
 __version__ = '0.2.6'
 short_version = __version__
src/turbomind/kernels/gemm_s_f16/format.cu
View file @
14ad512a
...
...
@@ -76,15 +76,75 @@ void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStre
    permute_u4<0, 1, 2, 3, 4, 5, 6, 7, 8, 9><<<512, 512, 0, st>>>(dst, src, shape);
}

template<typename T>
void PrintData(cudaStream_t stream, const T* input, int size)
{
    int input_size = size;
    T*  h_data;
    h_data = new T[input_size];
    cudaMemcpy(h_data, input, input_size * sizeof(T), cudaMemcpyDeviceToHost);
    if constexpr (std::is_same<T, half>::value) {
        for (int i = 0; i < input_size; i++) {
            printf("%f ", __half2float(h_data[i]));
        }
    }
    else if constexpr (std::is_same<T, half2>::value) {
        for (int i = 0; i < input_size; i++) {
            printf("x:%f y:%f ", __half2float(h_data[i].data[0]), __half2float(h_data[i].data[1]));
        }
    }
    else if constexpr (std::is_same<T, uint32_t>::value) {
        for (int i = 0; i < input_size; i++) {
            printf(" %u ", h_data[i]);
        }
    }
    printf("\n");
    delete[] h_data;
}

#define INSTANTIATEPRINTDATA(T)                                                                                        \
    template void PrintData(cudaStream_t stream, const T* input, int size);
INSTANTIATEPRINTDATA(__half)
INSTANTIATEPRINTDATA(float)
INSTANTIATEPRINTDATA(half2)
INSTANTIATEPRINTDATA(uint32_t)

void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
    // permutation for [k, m/8] layout
    Array<int, 10> shape{1, k / 8, 2, 2, 2, 1, m / 8, 2, 2, 2};
    // 0123456 --> 4,6,7,5,0,3,1,2
    // permute_u4<4, 6, 7, 5, 0, 3, 1, 2><<<512, 512, 0, st>>>(dst, src, shape);
    permute_u4<5, 6, 8, 9, 7, 0, 1, 4, 2, 3><<<512, 512, 0, st>>>(dst, src, shape);
}

__global__ void permute_u32(int num_kernels, uint32_t* dst, const uint32_t* src, int m, int k)
{
    // [k, m] --> [m, k]
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= num_kernels)
        return;
    int j   = id % k;
    int i   = id / k;
    dst[id] = src[j * m + i];
}

void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
    // permutation for [k, m] layout
    int num_kernels = k * m;
    permute_u32<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, st>>>(num_kernels, dst, src, m, k);
}

__global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count)
{
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {
...
...
@@ -269,8 +329,7 @@ __global__ void input_padding_kernel(int num_kernels,T* output,const T* input,in
template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
{
    // input is [m, k]; output is [m, k + pad_groupcount * group_size]
    int num_kernels = m * (k + pad_groupcount * group_size);
    input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(
        num_kernels, output, input, m, k, group_size, pad_groupcount);
}
...
...
@@ -282,3 +341,5 @@ template void input_padding(cudaStream_t stream, T* output,const T* input,int m,
INSTANTIATEINPUTPADING(__half)

}  // namespace turbomind
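Two of the new helpers are simple layout changes: `permute_u32` (used by `reformat_s4_k_m8_tarnsscale`) is a plain [k, m] to [m, k] transpose of packed 32-bit words, and `input_padding` zero-extends the K dimension by `pad_groupcount * group_size`. A small NumPy sketch of the same index transformations (NumPy is only used here for illustration; the actual work is done by the CUDA kernels above):

    import numpy as np

    def transpose_k_m(src, m, k):
        """dst[i * k + j] = src[j * m + i], i.e. [k, m] row-major -> [m, k] row-major."""
        return src.reshape(k, m).T.copy().reshape(-1)

    def pad_k(x, group_size, pad_groupcount):
        """Zero-pad each row of an [m, k] matrix to k + pad_groupcount * group_size columns."""
        m, k = x.shape
        out = np.zeros((m, k + pad_groupcount * group_size), dtype=x.dtype)
        out[:, :k] = x
        return out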
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h  View file @ 14ad512a
...
...
@@ -36,6 +36,11 @@ void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t*
template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);

void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);

void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);

template<typename T>
void PrintData(cudaStream_t stream, const T* input, int size);

class GemmS4F16 {
public:
    GemmS4F16();
...
...
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc  View file @ 14ad512a
...
...
@@ -17,7 +17,7 @@
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/logger.h"
...
...
@@ -42,6 +42,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                    WeightType weight_type,
                                                    int        group_size,
                                                    int        w4_weight_layout,
                                                    int        w4_pad_size,
                                                    bool       attn_bias,
                                                    size_t     tensor_para_size,
                                                    size_t     tensor_para_rank):
...
...
@@ -60,36 +61,42 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
    self_attn_weights.qkv.type             = weight_type;
    self_attn_weights.qkv.group_size       = group_size;
    self_attn_weights.qkv.w4_weight_layout = w4_weight_layout;
    self_attn_weights.qkv.w4_pad_size      = w4_pad_size;

    self_attn_weights.output.input_dims       = hidden_units_ / tensor_para_size_;
    self_attn_weights.output.output_dims      = hidden_units_;
    self_attn_weights.output.type             = weight_type;
    self_attn_weights.output.group_size       = group_size;
    self_attn_weights.output.w4_weight_layout = w4_weight_layout;
    self_attn_weights.output.w4_pad_size      = w4_pad_size;

    ffn_weights.gating.input_dims       = hidden_units_;
    ffn_weights.gating.output_dims      = inter_size_ / tensor_para_size_;
    ffn_weights.gating.type             = weight_type;
    ffn_weights.gating.group_size       = group_size;
    ffn_weights.gating.w4_weight_layout = w4_weight_layout;
    ffn_weights.gating.w4_pad_size      = w4_pad_size;

    ffn_weights.intermediate.input_dims       = hidden_units_;
    ffn_weights.intermediate.output_dims      = inter_size_ / tensor_para_size_;
    ffn_weights.intermediate.type             = weight_type;
    ffn_weights.intermediate.group_size       = group_size;
    ffn_weights.intermediate.w4_weight_layout = w4_weight_layout;
    ffn_weights.intermediate.w4_pad_size      = w4_pad_size;

    ffn_weights.fused_gating_intermediate.input_dims       = hidden_units_;
    ffn_weights.fused_gating_intermediate.output_dims      = inter_size_ / tensor_para_size_ * 2;
    ffn_weights.fused_gating_intermediate.type             = weight_type;
    ffn_weights.fused_gating_intermediate.group_size       = group_size;
    ffn_weights.fused_gating_intermediate.w4_weight_layout = w4_weight_layout;
    ffn_weights.fused_gating_intermediate.w4_pad_size      = w4_pad_size;

    ffn_weights.output.input_dims       = inter_size_ / tensor_para_size_;
    ffn_weights.output.output_dims      = hidden_units_;
    ffn_weights.output.type             = weight_type;
    ffn_weights.output.group_size       = group_size;
    ffn_weights.output.w4_weight_layout = w4_weight_layout;
    ffn_weights.output.w4_pad_size      = w4_pad_size;

    mallocWeights();
}
...
...
@@ -118,16 +125,9 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
    else {  // int8, int4
        const int factor = sizeof(float) * 8 / bit_size;
        FT_CHECK(weights.input_dims % factor == 0);
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
        if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-           size_t new_input_dims = weights.input_dims + 2 * weights.group_size;
+           size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
            deviceMalloc((int**)&weights.kernel, new_input_dims * weights.output_dims / factor);
            deviceMemSetZero((int*)weights.kernel, new_input_dims * weights.output_dims / factor);
...
...
@@ -171,15 +171,10 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
    }
    else {  // int8, int4
        const int factor = sizeof(float) * 8 / bit_size;
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
        if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-           size_t new_input_dims = weights.input_dims + weights.group_size;
+           size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
            output.insert(get_name("qweight"),
                          Tensor{MEMORY_GPU,
...
...
@@ -189,7 +184,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
            output.insert(get_name("scales_zeros"),
                          Tensor{MEMORY_GPU,
                                 getTensorType<T>(),
-                                {new_input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
+                                {new_input_dims * weights.output_dims / weights.group_size * 2 * sizeof(T)},
                                 weights.scales_and_zeros});
        }
        else {
...
...
@@ -307,23 +302,36 @@ void loadWeights(LlamaDenseWeight<T>& w,
        FT_CHECK(dim1 % factor == 0);
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
-       if ((dim0 % 4096 == 0) && (w.w4_weight_layout == 1 || w.w4_weight_layout == 2))
+       if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposition
        {
-           size_t new_dim0 = dim0 + 2 * w.group_size;
-           std::vector<size_t> w_shape{new_dim0, dim1 / factor * sizeof(uint32_t)};
-           loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
-           const size_t group_count = w.group_size > 0 ? new_dim0 / w.group_size : 1;
-           loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+           size_t new_dim0 = dim0;
+           if (dim0 % 4096 == 0)
+               new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+           // allocate workspace memory
+           int*  kernel_workspace = nullptr;
+           half* scales_workspace = nullptr;
+           deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+           deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+           deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+           // load the weight
+           std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+           loadWeightFromBin((int8_t*)kernel_workspace, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
+           const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+           loadWeightFromBin((half*)scales_workspace, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+           // transpose
+           reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+           reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+           // free the workspace
+           cudaFree(kernel_workspace);
+           cudaFree(scales_workspace);
+           kernel_workspace = nullptr;
+           scales_workspace = nullptr;
        }
        else {
            std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
            loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
...
...
@@ -332,9 +340,57 @@ void loadWeights(LlamaDenseWeight<T>& w,
            loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
        }
        // the weight is padded and transposed here
    }
}

template<typename T>
void transWeights(LlamaDenseWeight<T>& w, FtCudaDataType model_file_type)
{
    const auto   type     = model_file_type;
    size_t       dim0     = w.input_dims;
    size_t       dim1     = w.output_dims;
    const size_t bit_size = getBitSize(w.type);
    const int    factor   = sizeof(float) * 8 / bit_size;
    FT_CHECK(dim1 % factor == 0);
    if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposition
    {
        size_t new_dim0 = dim0;
        if (dim0 % 4096 == 0)
            new_dim0 = dim0 + w.w4_pad_size * w.group_size;
        // allocate workspace memory
        int*  kernel_workspace = nullptr;
        half* scales_workspace = nullptr;
        deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
        deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
        deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
        deviceMemSetZero((half*)scales_workspace, new_dim0 / w.group_size * dim1 * 2);
        // copy the already loaded weight into the workspace
        cudaD2Dcpy((int*)kernel_workspace, (int*)w.kernel, dim0 * dim1 / factor);
        cudaD2Dcpy((half*)scales_workspace, (half*)w.scales_and_zeros, dim0 / w.group_size * dim1 * 2);
        // transpose
        reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
        reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
        // free the workspace
        cudaFree(kernel_workspace);
        cudaFree(scales_workspace);
        kernel_workspace = nullptr;
        scales_workspace = nullptr;
    }
}

template<typename T>
void LlamaDecoderLayerWeight<T>::mallocWeights()
{
...
...
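Both `loadWeights` and the new `transWeights` share the same size arithmetic: when the input dimension is a multiple of 4096 and a transposed layout (`w4_weight_layout` of 1 or 2) is selected, the K dimension grows to `new_dim0 = dim0 + w4_pad_size * group_size`, and the qweight and scales_zeros buffers are sized accordingly. A small bookkeeping sketch (pure Python, element counts only; the function name is illustrative):

    def padded_w4_sizes(dim0, dim1, group_size, w4_pad_size, bits=4):
        """Return (packed qweight words, scales_zeros half elements) for the padded layout."""
        factor   = 32 // bits                              # int4 values packed per uint32
        new_dim0 = dim0 + w4_pad_size * group_size if dim0 % 4096 == 0 else dim0
        qweight_words = new_dim0 * dim1 // factor          # matches deviceMalloc for w.kernel
        scales_zeros  = new_dim0 // group_size * dim1 * 2  # scale + zero per group per output column
        return qweight_words, scales_zeros

    # e.g. a 4096 x 4096 projection with group_size=128 and w4_pad_size=2
    print(padded_w4_sizes(4096, 4096, 128, 2))   # (2228224, 278528)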
@@ -420,6 +476,19 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
    }
}

template<typename T>
void LlamaDecoderLayerWeight<T>::modifyModel(FtCudaDataType model_file_type)
{
    const auto rank_spec = std::to_string(tensor_para_rank_);
    const auto type      = model_file_type;
    transWeights(self_attn_weights.qkv, type);
    transWeights(self_attn_weights.output, type);
    transWeights(ffn_weights.fused_gating_intermediate, type);
    transWeights(ffn_weights.output, type);
}

template<typename T>
TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
{
...
...
src/turbomind/models/llama/LlamaDecoderLayerWeight.h  View file @ 14ad512a
...
...
@@ -36,6 +36,7 @@ public:
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            int        w4_pad_size,
                            bool       attn_bias,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank);
...
...
@@ -44,6 +45,7 @@ public:
    LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

    void loadModel(std::string dir_path, FtCudaDataType model_file_type);

    void modifyModel(FtCudaDataType model_file_type);

    TensorMap getParams(std::string prefix);
...
...
src/turbomind/models/llama/LlamaDenseWeight.h  View file @ 14ad512a
...
...
@@ -64,6 +64,7 @@ struct LlamaDenseWeight {
    T*  scales_and_zeros;
    int group_size;
    int w4_weight_layout;
    int w4_pad_size;
};

template<typename T>
...
...
src/turbomind/models/llama/LlamaLinear.h  View file @ 14ad512a
...
...
@@ -117,19 +117,19 @@ private:
        // check whether the x-padding workspace is large enough
        if (weight.input_dims % 4096 == 0)  // padding is required
        {
-           int pad_group_count = 2;
-           input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                         batch_size, weight.input_dims, weight.group_size, pad_group_count);
-           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                    weight.input_dims + pad_group_count * weight.group_size, weight.output_dims,
-                                    weight.group_size);
+           input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                         batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                    weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims,
+                                    weight.group_size);
            cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                  CUBLAS_OP_N,
                                  weight.output_dims,                                                      // m
                                  batch_size,                                                              // n
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),    // []
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)cublas_wrapper_->xpading_workspace_,
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  output_data,
                                  weight.output_dims);                                                     // m
        }
...
...
@@ -155,8 +155,7 @@ private:
        // check whether the ck workspace is large enough
        if (weight.input_dims % 4096 == 0) {
-           int pad_groupcount = 2;
-           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
-                                reinterpret_cast<const void*>(weight.kernel),
-                                reinterpret_cast<const void*>(weight.scales_and_zeros),
-                                reinterpret_cast<void*>(output_data),
-                                batch_size, weight.output_dims,
-                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                (weight.input_dims + pad_groupcount * weight.group_size),
-                                weight.output_dims, weight.group_size,
-                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
-                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
+                                reinterpret_cast<const void*>(weight.kernel),
+                                reinterpret_cast<const void*>(weight.scales_and_zeros),
+                                reinterpret_cast<void*>(output_data),
+                                batch_size, weight.output_dims,
+                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                (weight.input_dims + weight.w4_pad_size * weight.group_size),
+                                weight.output_dims, weight.group_size,
+                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
+                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
        }
        // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
        else {
...
...
@@ -208,19 +207,19 @@ private:
        // check whether the x-padding workspace is large enough
        if (weight.input_dims % 4096 == 0)  // padding is required
        {
-           int pad_group_count = 2;
-           input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                            batch_size, weight.input_dims, weight.group_size, pad_group_count);
-           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                    weight.input_dims + pad_group_count * weight.group_size, weight.output_dims,
-                                    weight.group_size);
+           input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                            batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                    weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims,
+                                    weight.group_size);
            cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                  CUBLAS_OP_N,
                                  weight.output_dims,                                                      // m
                                  batch_size,                                                              // n
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),    // []
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)cublas_wrapper_->xpading_workspace_,
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  output_tmp,
                                  weight.output_dims);                                                     // m
        }
...
...
@@ -246,8 +245,8 @@ private:
        if (weight.input_dims % 4096 == 0) {
-           int pad_groupcount = 2;
-           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
-                                reinterpret_cast<const void*>(weight.kernel),
-                                reinterpret_cast<const void*>(weight.scales_and_zeros),
-                                reinterpret_cast<void*>(output_tmp),
-                                batch_size, weight.output_dims,
-                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                (weight.input_dims + pad_groupcount * weight.group_size),
-                                weight.output_dims, weight.group_size,
-                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
-                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
+                                reinterpret_cast<const void*>(weight.kernel),
+                                reinterpret_cast<const void*>(weight.scales_and_zeros),
+                                reinterpret_cast<void*>(output_tmp),
+                                batch_size, weight.output_dims,
+                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                (weight.input_dims + weight.w4_pad_size * weight.group_size),
+                                weight.output_dims, weight.group_size,
+                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
+                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
        }
        // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
        else {
...
...
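The padded GEMM paths above never change the mathematical result: the activation is zero-extended from K = input_dims to K' = input_dims + w4_pad_size * group_size, and the dequantized-weight workspace is allocated and zero-initialized at the same K', so the extra rows contribute nothing to the product. A short NumPy check of that invariant (shapes are illustrative):

    import numpy as np

    m, k, group_size, w4_pad_size = 4, 4096, 128, 2
    k_pad = k + w4_pad_size * group_size            # the padded K used by the GEMM calls above

    x = np.random.rand(m, k)                        # activation
    w = np.random.rand(k, 8)                        # stands in for the dequantized weight

    x_pad = np.zeros((m, k_pad)); x_pad[:, :k] = x
    w_pad = np.zeros((k_pad, 8)); w_pad[:k, :] = w

    assert np.allclose(x @ w, x_pad @ w_pad)        # zero padding leaves the output unchanged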
src/turbomind/models/llama/LlamaWeight.cc  View file @ 14ad512a
...
...
@@ -33,6 +33,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            int        w4_pad_size,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank):
    hidden_units_(head_num * size_per_head),
...
...
@@ -57,6 +58,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                                                                     weight_type_,
                                                                     group_size,
                                                                     w4_weight_layout,
                                                                     w4_pad_size,
                                                                     attn_bias,
                                                                     tensor_para_size_,
                                                                     tensor_para_rank_));
...
...
@@ -69,7 +71,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
        std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
        const char* env_value            = str_w4_weight_layout.c_str();
        setenv(env_name, env_value, 1);
        // printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n", w4_weight_layout);
    }
    else {
...
...
@@ -128,8 +130,23 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
    }
}

template<typename T>
void LlamaWeight<T>::modifyModel()
{
    FtCudaDataType model_file_type = FtCudaDataType::FP16;
    if (weight_type_ == WeightType::kBF16) {
        model_file_type = FtCudaDataType::BF16;
    }
    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->modifyModel(model_file_type);
    }
}

template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
...
...
src/turbomind/models/llama/LlamaWeight.h  View file @ 14ad512a
...
...
@@ -38,6 +38,7 @@ struct LlamaWeight {
                WeightType weight_type,
                int        group_size,
                int        w4_weight_layout,
                int        w4_pad_size,
                size_t     tensor_para_size,
                size_t     tensor_para_rank);
...
...
@@ -47,6 +48,7 @@ struct LlamaWeight {
    LlamaWeight& operator=(const LlamaWeight& other) = delete;

    void loadModel(std::string dir_path);

    void modifyModel();

    TensorMap getParams();
...
...
src/turbomind/python/bind.cpp  View file @ 14ad512a
...
...
@@ -439,6 +439,11 @@ PYBIND11_MODULE(_turbomind, m)
             py::call_guard<py::gil_scoped_release>(),
             "device_id"_a,
             "rank"_a)
        .def("modify_shared_weights",
             &AbstractTransformerModel::modifySharedWeights,
             py::call_guard<py::gil_scoped_release>(),
             "device_id"_a,
             "rank"_a)
        .def("get_params", [](AbstractTransformerModel* model, int deviceId, int rank) {
...
...
src/turbomind/triton_backend/llama/LlamaTritonModel.cc  View file @ 14ad512a
...
...
@@ -187,6 +187,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
    quant_policy_     = reader.GetInteger("llama", "quant_policy", 0);
    group_size_       = reader.GetInteger("llama", "group_size", 0);
    w4_weight_layout_ = reader.GetInteger("llama", "w4_weight_layout", 2);
    w4_pad_size_      = reader.GetInteger("llama", "w4_pad_size", 2);

    // rotary embedding parameters
    attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
...
...
@@ -383,6 +384,7 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                          weight_type_,
                                                          group_size_,
                                                          w4_weight_layout_,
                                                          w4_pad_size_,
                                                          tensor_para_size_,
                                                          tensor_para_rank);
    // model inited with model_dir
...
...
@@ -392,6 +394,21 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
    return;
}

template<typename T>
void LlamaTritonModel<T>::modifySharedWeights(int device_id, int rank)
{
    ft::check_cuda_error(cudaSetDevice(device_id));
    const int tensor_para_rank   = rank % tensor_para_size_;
    const int pipeline_para_rank = rank / tensor_para_size_;
    ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
    if (weight_type_ == turbomind::WeightType::kINT4) {
        shared_weights_[device_id]->modifyModel();
    }
    return;
}

template<typename T>
TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
{
...
...
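`w4_weight_layout` and `w4_pad_size` are read from the `[llama]` section of the TurboMind config with defaults of 2, alongside the existing `group_size` and `quant_policy` keys. A hedged sketch of what the corresponding entries in a generated config might look like, parsed here with Python's standard configparser (the key names come from the `GetInteger` calls above; the surrounding values are illustrative):

    import configparser

    ini_text = "[llama]\nquant_policy = 0\ngroup_size = 128\nw4_weight_layout = 2\nw4_pad_size = 2\n"
    cfg = configparser.ConfigParser()
    cfg.read_string(ini_text)
    assert cfg.getint("llama", "w4_weight_layout") == 2
    assert cfg.getint("llama", "w4_pad_size") == 2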
src/turbomind/triton_backend/llama/LlamaTritonModel.h  View file @ 14ad512a
...
...
@@ -53,6 +53,7 @@ struct LlamaTritonModel: public AbstractTransformerModel {
                       std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;

    void createSharedWeights(int deviceId, int rank) override;

    void modifySharedWeights(int deviceId, int rank) override;

    TensorMap getParams(int deviceId, int rank) override;
...
...
@@ -102,6 +103,7 @@ private:
    int quant_policy_;
    int group_size_;
    int w4_weight_layout_;
    int w4_pad_size_;

    // shared weights for each device
    std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
...
...
src/turbomind/triton_backend/transformer_triton_backend.hpp  View file @ 14ad512a
...
...
@@ -325,6 +325,7 @@ struct AbstractTransformerModel {
                         std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;

    virtual void createSharedWeights(int deviceId, int rank) = 0;

    virtual void modifySharedWeights(int deviceId, int rank) = 0;

    virtual TensorMap getParams(int deviceId, int rank) = 0;
...
...