OpenDAS / Lmdeploy · Commits

Commit 14ad512a, authored Jul 01, 2024 by gaoqiong
Add AWQ multi-GPU support (增加awq 多卡支持)
Parent: 6ba90df9

Showing 20 changed files with 1140 additions and 157 deletions (+1140, −157)
3rdparty/gpufusion/nccl.h                                      +805  −0
README.md                                                      +41   −16
lmdeploy/cli/cli.py                                            +5    −0
lmdeploy/turbomind/deploy/converter.py                         +6    −1
lmdeploy/turbomind/deploy/target_model/base.py                 +3    −4
lmdeploy/turbomind/deploy/target_model/w4.py                   +12   −76
lmdeploy/turbomind/turbomind.py                                +34   −7
lmdeploy/version.py                                            +1    −1
src/turbomind/kernels/gemm_s_f16/format.cu                     +64   −3
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h                 +5    −0
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc          +101  −32
src/turbomind/models/llama/LlamaDecoderLayerWeight.h           +2    −0
src/turbomind/models/llama/LlamaDenseWeight.h                  +1    −0
src/turbomind/models/llama/LlamaLinear.h                       +15   −16
src/turbomind/models/llama/LlamaWeight.cc                      +18   −1
src/turbomind/models/llama/LlamaWeight.h                       +2    −0
src/turbomind/python/bind.cpp                                  +5    −0
src/turbomind/triton_backend/llama/LlamaTritonModel.cc         +17   −0
src/turbomind/triton_backend/llama/LlamaTritonModel.h          +2    −0
src/turbomind/triton_backend/transformer_triton_backend.hpp    +1    −0
3rdparty/gpufusion/nccl.h  (new file, 0 → 100644)
/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 18
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 21803
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>

/*! @brief Opaque handle to communicator
    @details A communicator contains information required to facilitate collective communications calls */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
/*! @brief Opaque unique id used to initialize communicators
    @details The ncclUniqueId must be passed to all participating ranks */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/ } ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
    @details The various result codes that RCCL API calls may return
    @{ */
/*! @brief Result type
    @details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum {
    ncclSuccess            = 0, /*!< No error */
    ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */
    ncclSystemError        = 2, /*!< Unhandled system error */
    ncclInternalError      = 3, /*!< Internal Error - Please report to RCCL developers */
    ncclInvalidArgument    = 4, /*!< Invalid argument */
    ncclInvalidUsage       = 5, /*!< Invalid usage */
    ncclRemoteError        = 6, /*!< Remote process exited or there was a network error */
    ncclInProgress         = 7, /*!< RCCL operation in progress */
    ncclNumResults         = 8  /*!< Number of result types */
} ncclResult_t;
/*! @} */

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
/*! @defgroup rccl_config_type Communicator Configuration
    @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig
    @{ */
/*! @brief Communicator configuration
    @details Users can assign value to attributes to specify the behavior of a communicator */
typedef struct ncclConfig_v21700 {
    /* attributes that users should never touch. */
    size_t       size;           /*!< Should not be touched */
    unsigned int magic;          /*!< Should not be touched */
    unsigned int version;        /*!< Should not be touched */
    /* attributes that users are able to customize. */
    int          blocking;       /*!< Whether or not calls should block or not */
    int          cgaClusterSize; /*!< Cooperative group array cluster size */
    int          minCTAs;        /*!< Minimum number of cooperative thread arrays (blocks) */
    int          maxCTAs;        /*!< Maximum number of cooperative thread arrays (blocks) */
    const char*  netName;        /*!< Force NCCL to use a specfic network */
    int          splitShare;     /*!< Allow communicators to share resources */
} ncclConfig_t;

/* Config initializer must be assigned to initialize config structure when it is created.
 * Not initialized config will result in an error. */
#define NCCL_CONFIG_INITIALIZER {                                          \
    sizeof(ncclConfig_t),                             /* size */           \
    0xcafebeef,                                       /* magic */          \
    NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */        \
    NCCL_CONFIG_UNDEF_INT,                            /* blocking */       \
    NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */ \
    NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */        \
    NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */        \
    NCCL_CONFIG_UNDEF_PTR,                            /* netName */        \
    NCCL_CONFIG_UNDEF_INT                             /* splitShare */     \
}
/*! @} */
/*! @defgroup rccl_api_version Version Information
    @details API call that returns RCCL version
    @{ */
/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
    @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] version Pointer to where version will be stored */
ncclResult_t ncclGetVersion(int* version);
/*! @cond include_hidden */
ncclResult_t pncclGetVersion(int* version);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
    @details API calls that operate on communicators.
             Communicators objects are used to launch collective communication
             operations. Unique ranks between 0 and N-1 must be assigned to
             each HIP device participating in the same Communicator.
             Using the same HIP device for multiple ranks of the same Communicator
             is not supported at this time.
    @{ */
/*! @brief Generates an ID for ncclCommInitRank.
    @details Generates an ID to be used in ncclCommInitRank.
             ncclGetUniqueId should be called once by a single rank and the
             ID should be distributed to all ranks in the communicator before
             using it as a parameter for ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] uniqueId Pointer to where uniqueId will be stored */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @cond include_hidden */
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/*! @endcond */

/*! @brief Create a new communicator with config.
    @details Create a new communicator (multi thread/process version) with a configuration
             set by users. See @ref rccl_config_type for more details.
             Each rank is associated to a CUDA device, which has to be set before calling
             ncclCommInitRank.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm   Pointer to created communicator
    @param[in]  nranks Total number of ranks participating in this communicator
    @param[in]  commId UniqueId required for initialization
    @param[in]  rank   Current rank to create communicator for. [0 to nranks-1]
    @param[in]  config Pointer to communicator configuration */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank,
                                    ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank,
                                     ncclConfig_t* config);
/*! @endcond */

/*! @brief Creates a new communicator (multi thread/process version).
    @details Rank must be between 0 and nranks-1 and unique within a communicator clique.
             Each rank is associated to a CUDA device, which has to be set before calling
             ncclCommInitRank. ncclCommInitRank implicitly syncronizes with other ranks,
             so it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm   Pointer to created communicator
    @param[in]  nranks Total number of ranks participating in this communicator
    @param[in]  commId UniqueId required for initialization
    @param[in]  rank   Current rank to create communicator for */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/*! @endcond */
/*! @brief Creates a clique of communicators (single process version).
    @details This is a convenience function to create a single-process communicator clique.
             Returns an array of ndev newly initialized communicators in comm.
             comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
             If devlist is NULL, the first ndev HIP devices are used.
             Order of devlist defines user-order of processors within the communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] comm    Pointer to array of created communicators
    @param[in]  ndev    Total number of ranks participating in this communicator
    @param[in]  devlist Array of GPU device indices to create for */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @cond include_hidden */
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/*! @endcond */

/*! @brief Finalize a communicator.
    @details ncclCommFinalize flushes all issued communications
             and marks communicator state as ncclInProgress. The state will change to ncclSuccess
             when the communicator is globally quiescent and related resources are freed; then,
             calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
             itself) without blocking.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to finalize */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommFinalize(ncclComm_t comm);
/*! @endcond */

/*! @brief Frees local resources associated with communicator object.
    @details Destroy all local resources associated with the passed in communicator object
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to destroy */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/*! @endcond */

/*! @brief Abort any in-progress calls and destroy the communicator object.
    @details Frees resources associated with communicator object and aborts any operations
             that might still be running on the device.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] comm Communicator to abort and destroy */
ncclResult_t ncclCommAbort(ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclCommAbort(ncclComm_t comm);
/*! @endcond */

/*! @brief Create one or more communicators from an existing one.
    @details Creates one or more communicators from an existing one.
             Ranks with the same color will end up in the same communicator.
             Within the new communicator, key will be used to order ranks.
             NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
             and will therefore return a NULL communicator.
             If config is NULL, the new communicator will inherit the original communicator's configuration
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm    Original communicator object for this rank
    @param[in]  color   Color to assign this rank
    @param[in]  key     Key used to order ranks within the same new communicator
    @param[out] newcomm Pointer to new communicator
    @param[in]  config  Config file for new communicator. May be NULL to inherit from comm */
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_errcheck Error Checking Calls
    @details API calls that check for errors
    @{ */
/*! @brief Returns a string for each result code.
    @details Returns a human-readable string describing the given result code.
    @return String containing description of result code.
    @param[in] result Result code to get description for */
const char* ncclGetErrorString(ncclResult_t result);
/*! @cond include_hidden */
const char* pncclGetErrorString(ncclResult_t result);
/*! @endcond */

/*! @brief Returns mesage on last result that occured.
    @details Returns a human-readable message of the last error that occurred.
    @return String containing the last result
    @param[in] comm is currently unused and can be set to NULL */
const char* ncclGetLastError(ncclComm_t comm);
/*! @cond include_hidden */
const char* pncclGetLastError(ncclComm_t comm);
/*! @endcond */

/*! @brief Checks whether the comm has encountered any asynchronous errors
    @details Query whether the provided communicator has encountered any asynchronous errors
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm       Communicator to query
    @param[out] asyncError Pointer to where result code will be stored */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/*! @cond include_hidden */
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_comminfo Communicator Information
    @details API calls that query communicator information
    @{ */
/*! @brief Gets the number of ranks in the communicator clique.
    @details Returns the number of ranks in the communicator clique (as set during initialization)
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm  Communicator to query
    @param[out] count Pointer to where number of ranks will be stored */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
/*! @cond include_hidden */
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/*! @endcond */

/*! @brief Get the ROCm device index associated with a communicator
    @details Returns the ROCm device number associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm   Communicator to query
    @param[out] device Pointer to where the associated ROCm device index will be stored */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @cond include_hidden */
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/*! @endcond */

/*! @brief Get the rank associated with a communicator
    @details Returns the user-ordered "rank" associated with the provided communicator.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  comm Communicator to query
    @param[out] rank Pointer to where the associated rank will be stored */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @cond include_hidden */
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_enumerations API Enumerations
    @details Enumerations used by collective communication calls
    @{ */
/*! @brief Dummy reduction enumeration
    @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;

/*! @brief Reduction operation selector
    @details Enumeration used to specify the various reduction operations
             ncclNumOps is the number of built-in ncclRedOp_t values and serves as
             the least possible value for dynamic ncclRedOp_t values constructed by
             ncclRedOpCreate functions.
             ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
             to be the largest signed value (since compilers are permitted to use
             signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
             RCCL versions to maintain ABI compatibility. */
typedef enum {
    ncclSum      = 0, /*!< Sum */
    ncclProd     = 1, /*!< Product */
    ncclMax      = 2, /*!< Max */
    ncclMin      = 3, /*!< Min */
    ncclAvg      = 4, /*!< Average */
    ncclNumOps   = 5, /*!< Number of built-in reduction ops */
    ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
} ncclRedOp_t;
/*! @brief Data types
    @details Enumeration of the various supported datatype */
typedef enum {
    ncclInt8     = 0,
    ncclChar     = 0,
    ncclUint8    = 1,
    ncclInt32    = 2,
    ncclInt      = 2,
    ncclUint32   = 3,
    ncclInt64    = 4,
    ncclUint64   = 5,
    ncclFloat16  = 6,
    ncclHalf     = 6,
    ncclFloat32  = 7,
    ncclFloat    = 7,
    ncclFloat64  = 8,
    ncclDouble   = 8,
    ncclBfloat16 = 9,
    ncclNumTypes = 10
} ncclDataType_t;
/*! @} */
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
    @details API calls relating to creation/destroying custom reduction operator
             that pre-multiplies local source arrays prior to reduction
    @{ */
/*! @brief Location and dereferencing logic for scalar arguments.
    @details Enumeration specifying memory location of the scalar argument.
             Based on where the value is stored, the argument will be dereferenced either
             while the collective is running (if in device memory), or before the ncclRedOpCreate()
             function returns (if in host memory). */
typedef enum {
    ncclScalarDevice        = 0, /*!< Scalar is in device-visible memory */
    ncclScalarHostImmediate = 1  /*!< Scalar is in host-visible memory */
} ncclScalarResidence_t;

/*! @brief Create a custom pre-multiplier reduction operator
    @details Creates a new reduction operator which pre-multiplies input values by a given
             scalar locally before reducing them with peer values via summation. For use
             only with collectives launched against *comm* and *datatype*. The
             *residence* argument indicates how/when the memory pointed to by *scalar*
             will be dereferenced. Upon return, the newly created operator's handle
             is stored in *op*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] op        Pointer to where newly created custom reduction operator is to be stored
    @param[in]  scalar    Pointer to scalar value.
    @param[in]  datatype  Scalar value datatype
    @param[in]  residence Memory type of the scalar value
    @param[in]  comm      Communicator to associate with this custom reduction operator */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
                                      ncclScalarResidence_t residence, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
                                       ncclScalarResidence_t residence, ncclComm_t comm);
/*! @endcond */

/*! @brief Destroy custom reduction operator
    @details Destroys the reduction operator *op*. The operator must have been created by
             ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
             destroyed as soon as the last RCCL function which is given that operator returns.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] op   Custom reduction operator is to be destroyed
    @param[in] comm Communicator associated with this reduction operator */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @cond include_hidden */
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_collective_api Collective Communication Operations
    @details Collective communication operations must be called separately for each
             communicator in a communicator clique.
             They return when operations have been enqueued on the HIP stream.
             Since they may perform inter-CPU synchronization, each call has to be done
             from a different thread or process, or need to use Group Semantics (see
             below).
    @{ */
/*! @brief Reduce
    @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
             operation.
             *recvbuff* may be NULL on all calls except for root device.
             *root* is the rank (not the HIP device) where data will reside after the
             operation is complete.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Local device data buffer to be reduced
    @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks.
    @param[in]  count    Number of elements in every send buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  op       Reduction operator type
    @param[in]  root     Rank where result data array will be stored
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                        ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                         ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief (Deprecated) Broadcast (in-place)
    @details Copies *count* values from *root* to all other devices.
             root is the rank (not the CUDA device) where data resides before the
             operation is started.
             This operation is implicitly in-place.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in,out] buff     Input array on *root* to be copied to other ranks. Output array for all ranks.
    @param[in]     count    Number of elements in data buffer
    @param[in]     datatype Data buffer element datatype
    @param[in]     root     Rank owning buffer to be copied to others
    @param[in]     comm     Communicator group object to execute on
    @param[in]     stream   HIP stream to execute collective on */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
                       ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
                        ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Broadcast
    @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
             *root* is the rank (not the HIP device) where data resides before the operation is started.
             *sendbuff* may be NULL on ranks other than *root*.
             In-place operation will happen if *sendbuff* == *recvbuff*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Data array to copy (if *root*). May be NULL for other ranks
    @param[in]  recvbuff Data array to store received array
    @param[in]  count    Number of elements in data buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  root     Rank of broadcast root
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                            int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-Reduce
    @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
             leaves identical copies of result on each *recvbuff*.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Input data array to reduce
    @param[out] recvbuff Data array to store reduced result array
    @param[in]  count    Number of elements in data buffer
    @param[in]  datatype Data buffer element datatype
    @param[in]  op       Reduction operator
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                            ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Reduce-Scatter
    @details Reduces data in *sendbuff* using *op* operation and leaves reduced result
             scattered over the devices so that *recvbuff* on rank i will contain the i-th
             block of the result.
             Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
             should have a size of at least nranks*recvcount elements.
             In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Input data array to reduce
    @param[out] recvbuff  Data array to store reduced result subarray
    @param[in]  recvcount Number of elements each rank receives
    @param[in]  datatype  Data buffer element datatype
    @param[in]  op        Reduction operator
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                               ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                                ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-Gather
    @details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
             receiving data from rank i at offset i*sendcount.
             Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
             should have a size of at least nranks*sendcount elements.
             In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Input data array to send
    @param[out] recvbuff  Data array to store the gathered result
    @param[in]  sendcount Number of elements each rank sends
    @param[in]  datatype  Data buffer element datatype
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                           ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                            ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Send
    @details Send data from *sendbuff* to rank *peer*.
             Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
             as this rank.
             This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
             need to progress concurrently to complete, they must be fused within a ncclGroupStart /
             ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] sendbuff Data array to send
    @param[in] count    Number of elements to send
    @param[in] datatype Data buffer element datatype
    @param[in] peer     Peer rank to send to
    @param[in] comm     Communicator group object to execute on
    @param[in] stream   HIP stream to execute collective on */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
                      ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
                       ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief Receive
    @details Receive data from rank *peer* into *recvbuff*.
             Rank *peer* needs to call ncclSend with the same datatype and the same count
             as this rank.
             This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
             need to progress concurrently to complete, they must be fused within a ncclGroupStart/
             ncclGroupEnd section.
    @return Result code. See @ref rccl_result_code for more details.
    @param[out] recvbuff Data array to receive
    @param[in]  count    Number of elements to receive
    @param[in]  datatype Data buffer element datatype
    @param[in]  peer     Peer rank to send to
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                      ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                       ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief Gather
    @details Root device gathers *sendcount* values from other GPUs into *recvbuff*,
             receiving data from rank i at offset i*sendcount.
             Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff*
             should have a size of at least nranks*sendcount elements.
             In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
             *recvbuff* may be NULL on ranks other than *root*.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Data array to send
    @param[out] recvbuff  Data array to receive into on *root*.
    @param[in]  sendcount Number of elements to send per rank
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that receives data from all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                        int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief Scatter
    @details Scattered over the devices so that recvbuff on rank i will contain the i-th
             block of the data on root.
             Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
             should have a size of at least nranks*recvcount elements.
             In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Data array to send (on *root* rank). May be NULL on other ranks.
    @param[out] recvbuff  Data array to receive partial subarray into
    @param[in]  recvcount Number of elements to receive per rank
    @param[in]  datatype  Data buffer element datatype
    @param[in]  root      Rank that scatters data to all other ranks
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
                          int root, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief All-To-All
    @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
             block. Each block for sending/receiving has *count* elements, which means
             that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
             In-place operation is NOT supported. It is the user's responsibility
             to ensure that sendbuff and recvbuff are distinct.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff Data array to send (contains blocks for each other rank)
    @param[out] recvbuff Data array to receive (contains blocks from each other rank)
    @param[in]  count    Number of elements to send between each pair of ranks
    @param[in]  datatype Data buffer element datatype
    @param[in]  comm     Communicator group object to execute on
    @param[in]  stream   HIP stream to execute collective on */
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                          ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
                           ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @brief All-To-Allv
    @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
             to device (j). At the same time, device (i) receives recvcounts[j] of data
             from device (j) to be placed at rdispls[j].
             sendcounts, sdispls, recvcounts and rdispls are all measured in the units
             of datatype, not bytes.
             In-place operation will happen if sendbuff == recvbuff.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff   Data array to send (contains blocks for each other rank)
    @param[in]  sendcounts Array containing number of elements to send to each participating rank
    @param[in]  sdispls    Array of offsets into *sendbuff* for each participating rank
    @param[out] recvbuff   Data array to receive (contains blocks from each other rank)
    @param[in]  recvcounts Array containing number of elements to receive from each participating rank
    @param[in]  rdispls    Array of offsets into *recvbuff* for each participating rank
    @param[in]  datatype   Data buffer element datatype
    @param[in]  comm       Communicator group object to execute on
    @param[in]  stream     HIP stream to execute collective on */
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
                           void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
                           ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
                            void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
                            ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */
/*! @} */
/*! @defgroup msccl_api MSCCL Algorithm
    @details API calls relating to the optional MSCCL algorithm datapath
    @{ */
/*! @brief Opaque handle to MSCCL algorithm */
typedef int mscclAlgoHandle_t;

/*! @brief MSCCL Load Algorithm
    @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
             its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
             scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  mscclAlgoFilePath Path to MSCCL algorithm file
    @param[out] mscclAlgoHandle   Returned handle to MSCCL algorithm
    @param[in]  rank              Current rank */
ncclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
/*! @cond include_hidden */
ncclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank);
/*! @endcond */

/*! @brief MSCCL Run Algorithm
    @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
             list merges all possible parameters required by different operations as this
             is a general-purposed API. This API is expected to be called by MSCCL
             scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in]  sendBuff        Data array to send
    @param[in]  sendCounts      Array containing number of elements to send to each participating rank
    @param[in]  sDisPls         Array of offsets into *sendbuff* for each participating rank
    @param[out] recvBuff        Data array to receive
    @param[in]  recvCounts      Array containing number of elements to receive from each participating rank
    @param[in]  rDisPls         Array of offsets into *recvbuff* for each participating rank
    @param[in]  count           Number of elements
    @param[in]  dataType        Data buffer element datatype
    @param[in]  root            Root rank index
    @param[in]  peer            Peer rank index
    @param[in]  op              Reduction operator
    @param[in]  mscclAlgoHandle Handle to MSCCL algorithm
    @param[in]  comm            Communicator group object to execute on
    @param[in]  stream          HIP stream to execute collective on */
ncclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
                          void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
                          size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
                          mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream);
/*! @cond include_hidden */
ncclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
                           void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
                           size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
                           mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream);
/*! @endcond */

/*! @brief MSCCL Unload Algorithm
    @details Unload MSCCL algorithm previous loaded using its handle. This API
             is expected to be called by MSCCL scheduler instead of end users.
    @return Result code. See @ref rccl_result_code for more details.
    @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload */
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @cond include_hidden */
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_group_api Group semantics
    @details When managing multiple GPUs from a single thread, and since RCCL collective
             calls may perform inter-CPU synchronization, we need to "group" calls for
             different ranks/devices into a single call.
             Grouping RCCL calls as being part of the same collective operation is done
             using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
             collective calls until the ncclGroupEnd call, which will wait for all calls
             to be complete. Note that for collective communication, ncclGroupEnd only
             guarantees that the operations are enqueued on the streams, not that
             the operation is effectively done.
             Both collective communication and ncclCommInitRank can be used in conjunction
             of ncclGroupStart/ncclGroupEnd, but not together.
             Group semantics also allow to fuse multiple operations on the same device
             to improve performance (for aggregated collective calls), or to permit
             concurrent progress of multiple send/receive operations.
    @{ */
/*! @brief Group Start
    @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
             a single RCCL operation. Nothing will be started on the HIP stream until
             ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupStart();
/*! @cond include_hidden */
ncclResult_t pncclGroupStart();
/*! @endcond */

/*! @brief Group End
    @details End a group call. Start a fused RCCL operation consisting of all calls since
             ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
             need to be called after ncclGroupEnd.
    @return Result code. See @ref rccl_result_code for more details. */
ncclResult_t ncclGroupEnd();
/*! @cond include_hidden */
ncclResult_t pncclGroupEnd();
/*! @endcond */
/*! @} */

#ifdef __cplusplus
}  // end extern "C"
#endif

#endif  // end include guard
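The group-semantics block at the end of this header is the part the multi-GPU AWQ path leans on when several ranks/devices are driven from one process: send/recv pairs have to be fused between ncclGroupStart and ncclGroupEnd so they can progress concurrently. As a rough framework-level analogue (not the raw RCCL C API), PyTorch's NCCL/RCCL backend exposes the same pattern through batched point-to-point operations. A minimal sketch, assuming torch.distributed has already been initialized with the nccl backend and world_size >= 2:

```python
import torch
import torch.distributed as dist

def ring_exchange(tensor: torch.Tensor) -> torch.Tensor:
    """Send `tensor` to the next rank and receive one from the previous rank.

    The isend/irecv pair is batched so both operations can progress
    concurrently, mirroring the ncclGroupStart/ncclGroupEnd fusion above.
    """
    rank, world = dist.get_rank(), dist.get_world_size()
    recv = torch.empty_like(tensor)          # must live on this rank's GPU for the nccl backend
    ops = [
        dist.P2POp(dist.isend, tensor, (rank + 1) % world),
        dist.P2POp(dist.irecv, recv, (rank - 1) % world),
    ]
    for req in dist.batch_isend_irecv(ops):  # grouped dispatch, then wait
        req.wait()
    return recv
```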
README.md  (+41 −16)
@@ -67,7 +67,9 @@ yum install rapidjson
 export NCCL_LIB_DIR=/opt/dtk/cuda/lib64
 pip3 install -r requirements.txt
 pip3 install urllib3==1.24
-# switch the apt-get sources and add the Tsinghua mirror
+apt-get install rapidjson-dev
+# if the install fails, switch the apt-get sources and add the Tsinghua mirror
+# update after adding the mirror
 #vim /etc/apt/sources.list
 # add the Tsinghua mirror entries as follows:
@@ -75,17 +77,17 @@ pip3 install urllib3==1.24
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
-# update after switching the sources
-sudo apt-get update
-apt-get install rapidjson-dev
+# after switching the sources, update and then re-run the install
+#sudo apt-get update
 # set the nccl environment variable
 export NCCL_LAUNCH_MODE=GROUP
 ```
 Notes:
 1. When starting docker, the -v /opt/hyhal:/opt/hyhal mount must not be omitted.
 2. The URL given in the gpufusion wget command may change; you can open the page it points to and download the corresponding archive manually.
+3. When building inside the DTK24041 pytorch image, the dtk bundled in that image already ships gpufusion files; during the build of this project one of them must be replaced: put lmdeploy/3rdparty/gpufusion/nccl.h under /opt/dtk/cuda/include.

#### Build and install from source
- Download the code
@@ -113,13 +115,16 @@ cd dist && pip3 install lmdeploy*
 ### Model conversion
 ```bash
-# <model_name>    model name ('llama', 'internlm', 'vicuna', 'wizardlM', 'internlm-chat-7b', 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'baichuan-7b', 'baichuan2-7b', 'puyu', 'llama2', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'solar', 'ultralm', 'ultracm', 'yi')
+# <model_name>    model name (['base', 'llama', 'vicuna', 'wizardlm', 'internlm', 'internlm-chat', 'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'internlm2-1_8b', 'internlm2-7b', 'internlm2-20b', 'internlm2', 'internlm2-chat', 'internlm2-chat-20b', 'baichuan-base', 'baichuan-7b', 'baichuan2', 'baichuan2-7b', 'puyu', 'llama2', 'llama-2', 'llama-2-chat', 'qwen', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'falcon', 'chatglm', 'chatglm2-6b', 'solar', 'solar-70b', 'ultralm', 'ultracm', 'yi', 'yi-chat', 'yi-200k', 'yi-34b', 'Mistral-7B-Instruct', 'mixtral', 'gemma', 'deepseek', 'deepseek-chat', 'yi-vl'])
 # <model_path>    path to the model
-# <model_format>  model format ('llama', 'hf', or None; may be left unset, in which case it defaults to None and the code infers the format from the model, so it is usually not written)
-# <model_format>  destination path for the output (default ./workspace)
+# <model_format>  model format ('awq', 'hf', 'llama')
+# <dst_path>      destination path for the output (default ./workspace)
 # <tp>            number of GPUs used for tensor parallelism; should be 2^n
 # <quant_model_path>  the AWQ-quantized model
 # if using an fp16 model
 lmdeploy convert ${model_name} ${model_path} --model-format ${model_format} --dst-path ${dst_path} --tp ${tp}
 # if using an AWQ model
 lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 ```
 ### Run
 #### Run from the bash console
@@ -155,21 +160,41 @@ see the docs [here](docs/zh_cn/serving) for details on using the api-server
 codellama deployment can follow [codellama](docs/zh_cn/supported_models/codellama.md)
-### AWQ quantized inference
+## AWQ quantized inference
 This release supports quantized inference; the steps are as follows:
 ```bash
-#group_size: the grouping parameter used when the model was quantized, usually 128
+# quantize using a calibration dataset
+# any dataset can be used as needed; the example below quantizes with c4
+# edit the get_c4() function in lmdeploy/lmdeploy/lite/utils/calib_dataloader.py to point to a local dataset path
+lmdeploy lite auto_awq ${model_path} --calib-dataset 'c4' --calib-samples 128 --calib-seqlen 2048 --w-bits 4 --w-group-size 128 --work-dir ${quant_model_path}
+#group_size: the grouping parameter used when the model was quantized; only 128 is supported
+#<tp>       number of GPUs used for tensor parallelism; should be 2^n
+#<dst-path> destination folder for the converted model
 #step 1: convert the model
-lmdeploy convert ${model_name} ${model_path} --model_format awq --group-size ${group_size} --tp ${tp} --dst-path ${dst_path}
+lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 #step 2: run the model
 lmdeploy chat turbomind ${dst_path} --tp ${tp}
 ```
 Notes:
-1. This release temporarily supports only tp=1 single-GPU quantized inference, only on the KM-AI card type; K100/Z100/Z100L are not supported yet.
-2. Quantized inference in this release requires first converting the model to the turbomind format with convert and then running inference; quantized inference directly from the hf model is not supported yet.
-3. This release does not yet support quantization with a calibration dataset; the quantized model must be obtained elsewhere.
+1. This release supports only the KM-AI card type; K100/Z100/Z100L are not supported yet.
+2. For benchmark evaluation, AWQ models cannot be evaluated directly from the hf model; convert the quantized model to the turbomind format first, and the tp used at run time must match the tp given at conversion time.
+3. When quantizing llama2-70b and qwen-72b with a dataset, a calib-samples value of 120 is recommended.
+4. The multi-GPU support matrix is as follows:

| Model | AWQ TP=1 | AWQ TP=2 | AWQ TP=4 |
| :----------: | :------: | :--: | :--: |
| Llama2-7B-chat | Yes | Yes | No |
| Llama2-13B-chat | Yes | Yes | Yes |
| Llama2-70B-chat | Yes | Yes | Yes |
| qwen-7B-chat | Yes | Yes | No |
| qwen-14B-chat | Yes | No | No |
| qwen-72B-chat | Yes | Yes | Yes |

Remark: qwen-14b-chat does not support multi-GPU AWQ inference because it contains a GEMM of size [13696, 5120]; with group_size = 128 the scale shape is [13696/128, 5120] = [107, 5120], and 107 is not divisible by tp = 2 or 4. You can use this rule to judge whether your own model supports multi-GPU AWQ inference.
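The divisibility rule in the remark can be checked up front. A small helper applying it (hypothetical, not part of lmdeploy):

```python
def awq_tp_supported(rows: int, cols: int, tp: int, group_size: int = 128) -> bool:
    """A [rows, cols] GEMM weight splits across `tp` GPUs under AWQ only if
    the scale matrix (rows / group_size rows) divides evenly by tp."""
    if rows % group_size != 0:
        return False
    scale_rows = rows // group_size
    return scale_rows % tp == 0

# qwen-14b-chat example from the remark: [13696, 5120] gives 107 scale rows,
# which is not divisible by 2 or 4, so only TP=1 works for that layer.
print(awq_tp_supported(13696, 5120, tp=2))  # False
print(awq_tp_supported(13696, 5120, tp=1))  # True
```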
## result
(result image)
lmdeploy/cli/cli.py

@@ -66,6 +66,11 @@ class CLI(object):
                             type=int,
                             default=2,
                             help='A parameter used in AWQ to control the layout of weight ')
+        parser.add_argument('--w4-pad-size',
+                            type=int,
+                            default=2,
+                            help='A parameter used in AWQ to control the pad size of weight ')
         parser.set_defaults(run=CLI.convert)

     @staticmethod
lmdeploy/turbomind/deploy/converter.py

@@ -197,6 +197,7 @@ def main(model_name: str,
          quant_path: str = None,
          group_size: int = 0,
          w4_weight_layout: int = 2,
+         w4_pad_size: int = 2,
          **kwargs):
     """deploy llama family models via turbomind.

@@ -217,6 +218,7 @@ def main(model_name: str,
         group_size (int): a parameter used in AWQ to quantize fp16 weights
             to 4 bits
         w4_weight_layout (int): a parameter used in AWQ to control the layout of weight
+        w4_pad_size (int): a parameter used in AWQ to control the pad size of weight
         kwargs (dict): other params for convert
     """

@@ -263,12 +265,15 @@ def main(model_name: str,
     cfg.rotary_embedding = cfg.size_per_head
     cfg.group_size = group_size
     cfg.w4_weight_layout = w4_weight_layout
+    cfg.w4_pad_size = w4_pad_size

     if inferred_model_format.find('awq') != -1:
         cfg.weight_type = 'int4'
         output_format = 'w4'
         assert group_size > 0, f'group_size: {group_size} should > 0'
-        print("w4_weight_layout:", w4_weight_layout)
+        # print("w4_weight_layout:",w4_weight_layout)
         assert w4_weight_layout >= 0 and w4_weight_layout < 3, \
             f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
+        assert w4_pad_size >= 0 and w4_pad_size < 5, f'w4_pad_size should >= 0 and <5'
     else:
         # output_format = update_output_format(model_name, inferred_model_format,
         #                                      model_path, output_format)
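The asserts above fix the accepted ranges for the two AWQ knobs the converter now carries. A standalone restatement of that validation (illustrative only; the real checks live in converter.py as shown):

```python
def validate_w4_options(group_size: int, w4_weight_layout: int, w4_pad_size: int) -> None:
    """Mirror of the converter's checks for AWQ ('w4') output: group_size must be
    positive, the layout selector is one of {0, 1, 2}, and the pad size is 0-4."""
    assert group_size > 0, f'group_size: {group_size} should > 0'
    assert 0 <= w4_weight_layout < 3, f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
    assert 0 <= w4_pad_size < 5, f'w4_pad_size: {w4_pad_size} should >= 0 and < 5'

validate_w4_options(group_size=128, w4_weight_layout=2, w4_pad_size=2)  # the CLI defaults
```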
lmdeploy/turbomind/deploy/target_model/base.py

@@ -54,6 +54,7 @@ class TurbomindModelConfig:
     size_per_head: int = 128
     group_size: int = 0
     w4_weight_layout: int = 2
+    w4_pad_size: int = 2
     max_batch_size: int = 64
     max_context_token_num: int = 1
     step_length: int = 1

@@ -208,6 +209,7 @@ class BaseOutputModel(ABC):
                 param = param.to(torch_type)
                 tprint(name, param.shape)
                 _tofile(param, osp.join(self.out_dir, name))
             elif len(self.tm_params) > 0:
                 tm_params = self.tm_params
                 weight_type = self.cfg.weight_type

@@ -228,6 +230,7 @@ class BaseOutputModel(ABC):
                     torch_tensor = torch_tensor.float()
                 for tm_tensor in tm_params[name]:
                     tm_tensor.copy_from(torch_tensor)
                 tm_params.pop(name)
             else:
                 tprint('skip export', name, param.shape)

@@ -325,10 +328,6 @@ def permute(x: torch.Tensor, size_per_head: int = 128):
         return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1)


-def permute_trans(x: torch.Tensor):
-    if x.shape[-1] > 1:
-        dim = x.shape[-1]
-        return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim, -1)


 def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, dim: int):
lmdeploy/turbomind/deploy/target_model/w4.py

@@ -8,7 +8,7 @@ import lmdeploy
 from ..source_model.base import BaseInputModel, BaseReader
 from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
-                   merge_qkv, permute, permute_trans)
+                   merge_qkv, permute)

 # import _turbomind as _tm
 # TODO: find another way import _turbomind

@@ -117,6 +117,7 @@ class TurbomindW4Model(BaseOutputModel):
         group_size = self.cfg.group_size
         tp = self.cfg.tensor_para_size
         w4_weight_layout = self.cfg.w4_weight_layout
+        w4_pad_size = self.cfg.w4_pad_size
         size_per_head = self.cfg.size_per_head

         # attn
         q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))

@@ -134,48 +135,15 @@ class TurbomindW4Model(BaseOutputModel):
         qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
         qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)

-        pad_group_count = 2
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if qkv_qw.shape[0] % 4096 == 0:
-                qkv_qw_padding = torch.zeros(group_size * pad_group_count, qkv_qw.shape[1], dtype=torch.int32).cuda()
-                qkv_qw = torch.cat((qkv_qw, qkv_qw_padding), dim=0).contiguous()
-                qkv_qz_padding = torch.zeros(pad_group_count, qkv_qz.shape[1], dtype=torch.int32).cuda()
-                qkv_qz = torch.cat((qkv_qz, qkv_qz_padding), dim=0).contiguous()
-                qkv_s_padding = torch.zeros(pad_group_count, qkv_s.shape[1], dtype=torch.float16).cuda()
-                qkv_s = torch.cat((qkv_s, qkv_s_padding), dim=0).contiguous()
-            qkv_qw, qkv_sz = convert_s4_(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            qkv_sz = permute_trans(qkv_sz)
-        else:
-            qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            #print("请设置weight layout\n")
+        qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
         self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
         self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if o_qw.shape[0] % 4096 == 0:
-                o_qw_padding = torch.zeros(group_size * pad_group_count, o_qw.shape[1], dtype=torch.int32).cuda()
-                o_qw = torch.cat((o_qw, o_qw_padding), dim=0).contiguous()
-                o_qz_padding = torch.zeros(pad_group_count, o_qz.shape[1], dtype=torch.int32).cuda()
-                o_qz = torch.cat((o_qz, o_qz_padding), dim=0).contiguous()
-                o_s_padding = torch.zeros(pad_group_count, o_s.shape[1], dtype=torch.float16).cuda()
-                o_s = torch.cat((o_s, o_s_padding), dim=0).contiguous()
-            o_qw, o_sz = convert_s4_(o_qw, o_qz, o_s, group_size)
-            o_sz = permute_trans(o_sz)
-        else:
-            o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+        o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
         self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
         self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)

         q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
         if q_b is not None:
             q_b = permute(q_b, size_per_head)

@@ -184,6 +152,7 @@ class TurbomindW4Model(BaseOutputModel):
             self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
             self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)

         # ffn weights
         w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
         w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))

@@ -191,45 +160,12 @@ class TurbomindW4Model(BaseOutputModel):
         w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, w3_s)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if w13_qw.shape[0] % 4096 == 0:
-                w13_qw_padding = torch.zeros(group_size * pad_group_count, w13_qw.shape[1], dtype=torch.int32).cuda()
-                w13_qw = torch.cat((w13_qw, w13_qw_padding), dim=0).contiguous()
-                w13_qz_padding = torch.zeros(pad_group_count, w13_qz.shape[1], dtype=torch.int32).cuda()
-                w13_qz = torch.cat((w13_qz, w13_qz_padding), dim=0).contiguous()
-                w13_s_padding = torch.zeros(pad_group_count, w13_s.shape[1], dtype=torch.float16).cuda()
-                w13_s = torch.cat((w13_s, w13_s_padding), dim=0).contiguous()
-            w13_qw, w13_sz = convert_s4_(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
-            w13_sz = permute_trans(w13_sz)
-        else:
-            w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
+        w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
         self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
         self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros', -1)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            #pading
-            if w2_qw.shape[0] % 4096 == 0:
-                w2_qw_padding = torch.zeros(group_size * pad_group_count, w2_qw.shape[1], dtype=torch.int32).cuda()
-                w2_qw = torch.cat((w2_qw, w2_qw_padding), dim=0).contiguous()
-                w2_qz_padding = torch.zeros(pad_group_count, w2_qz.shape[1], dtype=torch.int32).cuda()
-                w2_qz = torch.cat((w2_qz, w2_qz_padding), dim=0).contiguous()
-                w2_s_padding = torch.zeros(pad_group_count, w2_s.shape[1], dtype=torch.float16).cuda()
-                w2_s = torch.cat((w2_s, w2_s_padding), dim=0).contiguous()
-            w2_qw, w2_sz = convert_s4_(w2_qw, w2_qz, w2_s, group_size)
-            w2_sz = permute_trans(w2_sz)
-        else:
-            w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+        w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
         self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
         self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
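For reference, the branches removed above implemented a simple padding rule: when a packed AWQ weight's row count is a multiple of 4096 (and the layout is 1 or 2), the qweight gains group_size * pad_group_count zero rows while the zeros and scales each gain pad_group_count rows, before conversion and scale transposition. A minimal standalone sketch of just that rule, with illustrative shapes and run on CPU tensors rather than the original .cuda() calls:

```python
import torch

def pad_awq_tensors(qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor,
                    group_size: int = 128, pad_group_count: int = 2):
    """Apply the padding rule from the removed w4.py branch."""
    if qweight.shape[0] % 4096 == 0:
        qweight = torch.cat(
            (qweight, torch.zeros(group_size * pad_group_count, qweight.shape[1], dtype=qweight.dtype)), dim=0)
        qzeros = torch.cat(
            (qzeros, torch.zeros(pad_group_count, qzeros.shape[1], dtype=qzeros.dtype)), dim=0)
        scales = torch.cat(
            (scales, torch.zeros(pad_group_count, scales.shape[1], dtype=scales.dtype)), dim=0)
    return qweight.contiguous(), qzeros.contiguous(), scales.contiguous()

# a 4096-row packed weight grows by 256 rows; its zeros/scales grow by 2 rows
qw, qz, s = pad_awq_tensors(torch.zeros(4096, 16, dtype=torch.int32),
                            torch.zeros(32, 16, dtype=torch.int32),
                            torch.zeros(32, 16, dtype=torch.float16))
print(qw.shape, qz.shape, s.shape)  # [4352, 16], [34, 16], [34, 16]
```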
lmdeploy/turbomind/turbomind.py

@@ -148,6 +148,7 @@ class TurboMind:
                  model_format: Optional[str] = None,
                  group_size: Optional[int] = None,
                  w4_weight_layout: Optional[int] = None,
+                 w4_pad_size: Optional[int] = None,
                  tp: Optional[int] = None,
                  chat_template_config: Optional[ChatTemplateConfig] = None,
                  **kwargs):

@@ -181,6 +182,7 @@ class TurboMind:
                                   model_format=model_format,
                                   group_size=group_size,
                                   w4_weight_layout=w4_weight_layout,
+                                  w4_pad_size=w4_pad_size,
                                   tp=tp,
                                   **kwargs)

@@ -237,6 +239,28 @@ class TurboMind:
             threads.append(t)
         for t in threads:
             t.join()

+    def _modify_weight(self, model_comm):
+        """modify weight if from_hf with awq."""
+        # TODO: support mpi
+        self.node_id = 0
+        self.node_num = 1
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
+        torch.cuda.synchronize()
+
+        def _modify_weight_func(device_id):
+            with cuda_ctx(device_id):
+                rank = self.node_id * self.gpu_count + device_id
+                model_comm.modify_shared_weights(device_id, rank)
+
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_modify_weight_func, args=(device_id, ))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()

     def _load_kv_qparams(self, model_path, tm_params, **kwargs):
         """Load kv qparams when loading from hf."""

@@ -271,10 +295,10 @@ class TurboMind:
             t.join()

         for _ in range(self.gpu_count):
             tensor_map = que.get()
             for k, v in tensor_map.items():
                 if k not in tm_params:
                     tm_params[k] = []
                 tm_params[k].append(v)

     def _from_hf(self, model_source: ModelSource, model_path: str,

@@ -307,10 +331,11 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
             cfg.w4_weight_layout = 2
+            cfg.w4_pad_size = 0
         else:
-            # output_format = update_output_format(cfg.model_name,
-            #                                      inferred_model_format,
-            #                                      model_path, output_format)
+            output_format = update_output_format(cfg.model_name,
+                                                 inferred_model_format,
+                                                 model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)

@@ -342,12 +367,15 @@ class TurboMind:
         # copy hf model weight to turbomind weight
         tm_params = output_model.tm_params
         self._get_model_params(model_comm, tm_params)
         logger.warning(f'get {len(tm_params)} model params')
         output_model.export()
+        self._modify_weight(model_comm)
         # load kv qparams
         self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
         assert len(tm_params) == 0, f'missing {tm_params.keys()}'
         return model_comm

@@ -381,7 +409,6 @@ class TurboMind:
         self.config = cfg
         self.model_name = cfg.model_name
         self.data_type = cfg.weight_type
-        #print("from_workspace_cfg:",cfg)
         # create model
         logger.warning(f'model_config:\n\n{cfg.toini()}')
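The new _modify_weight step added above reuses the pattern this loader already follows elsewhere: start one worker thread per visible GPU, derive the rank from node_id and device_id, and join every thread before continuing. A stripped-down sketch of that dispatch pattern with a generic callback (not the actual turbomind model_comm API):

```python
from threading import Thread

def run_per_device(gpu_count: int, node_id: int, fn) -> None:
    """Call fn(device_id, rank) on one thread per local GPU and wait for all of them."""
    threads = []
    for device_id in range(gpu_count):
        rank = node_id * gpu_count + device_id
        t = Thread(target=fn, args=(device_id, rank))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

# usage sketch:
run_per_device(2, 0, lambda dev, rank: print(f'device {dev} -> rank {rank}'))
```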
lmdeploy/version.py

 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple

-__dcu_version__ = '0.2.6'
+__dcu_version__ = '0.2.6+das1.1.git7063377.abi0.dtk2404.torch2.1.0'
 __version__ = '0.2.6'
 short_version = __version__
src/turbomind/kernels/gemm_s_f16/format.cu
View file @
14ad512a
...
...
@@ -76,15 +76,75 @@ void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStre
    permute_u4<0, 1, 2, 3, 4, 5, 6, 7, 8, 9><<<512, 512, 0, st>>>(dst, src, shape);
}

template<typename T>
void PrintData(cudaStream_t stream, const T* input, int size)
{
    int input_size = size;
    T*  h_data;
    h_data = new T[input_size];
    cudaMemcpy(h_data, input, input_size * sizeof(T), cudaMemcpyDeviceToHost);
    if constexpr (std::is_same<T, half>::value) {
        for (int i = 0; i < input_size; i++) {
            printf("%f ", __half2float(h_data[i]));
        }
    }
    else if constexpr (std::is_same<T, half2>::value) {
        for (int i = 0; i < input_size; i++) {
            printf("x:%f y:%f ", __half2float(h_data[i].data[0]), __half2float(h_data[i].data[1]));
        }
    }
    else if constexpr (std::is_same<T, uint32_t>::value) {
        for (int i = 0; i < input_size; i++) {
            printf(" %u ", h_data[i]);
        }
    }
    printf("\n");
    delete[] h_data;
}

#define INSTANTIATEPRINTDATA(T)                                                                                        \
    template void PrintData(cudaStream_t stream, const T* input, int size);
INSTANTIATEPRINTDATA(__half)
INSTANTIATEPRINTDATA(float)
INSTANTIATEPRINTDATA(half2)
INSTANTIATEPRINTDATA(uint32_t)

void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
    // permutation for [k, m/8] layout
    Array<int, 10> shape{1, k / 8, 2, 2, 2, 1, m / 8, 2, 2, 2};
    // 0123456 --> 4,6,7,5,0,3,1,2
    // permute_u4<4, 6, 7, 5, 0, 3, 1, 2><<<512, 512, 0, st>>>(dst, src, shape);
    permute_u4<5, 6, 8, 9, 7, 0, 1, 4, 2, 3><<<512, 512, 0, st>>>(dst, src, shape);
}

__global__ void permute_u32(int num_kernels, uint32_t* dst, const uint32_t* src, int m, int k)
{
    // [k, m] --> [m, k]
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= num_kernels)
        return;
    int j   = id % k;
    int i   = id / k;
    dst[id] = src[j * m + i];
}

void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
    // permutation for [k, m] layout
    int num_kernels = k * m;
    permute_u32<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, st>>>(num_kernels, dst, src, m, k);
}

__global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count)
{
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {
...
...
@@ -269,8 +329,7 @@ __global__ void input_padding_kernel(int num_kernels,T* output,const T* input,in
template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
{
    // input is [m, k]; output is [m, k + pad_groupcount * group_size]
    int num_kernels = m * (k + pad_groupcount * group_size);
    input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(
        num_kernels, output, input, m, k, group_size, pad_groupcount);
}
...
...
@@ -282,3 +341,5 @@ template void input_padding(cudaStream_t stream, T* output,const T* input,int m,
INSTANTIATEINPUTPADING(__half)

}  // namespace turbomind
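Two of the new helpers are simple layout changes: `permute_u32` (used by `reformat_s4_k_m8_tarnsscale`) is a plain [k, m] to [m, k] transpose of packed 32-bit words, and `input_padding` zero-extends the K dimension by `pad_groupcount * group_size`. A small NumPy sketch of the same index transformations (NumPy is only used here for illustration; the actual work is done by the CUDA kernels above):

    import numpy as np

    def transpose_k_m(src, m, k):
        """dst[i * k + j] = src[j * m + i], i.e. [k, m] row-major -> [m, k] row-major."""
        return src.reshape(k, m).T.copy().reshape(-1)

    def pad_k(x, group_size, pad_groupcount):
        """Zero-pad each row of an [m, k] matrix to k + pad_groupcount * group_size columns."""
        m, k = x.shape
        out = np.zeros((m, k + pad_groupcount * group_size), dtype=x.dtype)
        out[:, :k] = x
        return out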
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h  View file @ 14ad512a
...
...
@@ -36,6 +36,11 @@ void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t*
template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);

void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);

void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);

template<typename T>
void PrintData(cudaStream_t stream, const T* input, int size);

class GemmS4F16 {
public:
    GemmS4F16();
...
...
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc  View file @ 14ad512a
...
...
@@ -17,7 +17,7 @@
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/logger.h"
...
...
@@ -42,6 +42,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                    WeightType weight_type,
                                                    int        group_size,
                                                    int        w4_weight_layout,
                                                    int        w4_pad_size,
                                                    bool       attn_bias,
                                                    size_t     tensor_para_size,
                                                    size_t     tensor_para_rank):
...
...
@@ -60,36 +61,42 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
    self_attn_weights.qkv.type             = weight_type;
    self_attn_weights.qkv.group_size       = group_size;
    self_attn_weights.qkv.w4_weight_layout = w4_weight_layout;
    self_attn_weights.qkv.w4_pad_size      = w4_pad_size;

    self_attn_weights.output.input_dims       = hidden_units_ / tensor_para_size_;
    self_attn_weights.output.output_dims      = hidden_units_;
    self_attn_weights.output.type             = weight_type;
    self_attn_weights.output.group_size       = group_size;
    self_attn_weights.output.w4_weight_layout = w4_weight_layout;
    self_attn_weights.output.w4_pad_size      = w4_pad_size;

    ffn_weights.gating.input_dims       = hidden_units_;
    ffn_weights.gating.output_dims      = inter_size_ / tensor_para_size_;
    ffn_weights.gating.type             = weight_type;
    ffn_weights.gating.group_size       = group_size;
    ffn_weights.gating.w4_weight_layout = w4_weight_layout;
    ffn_weights.gating.w4_pad_size      = w4_pad_size;

    ffn_weights.intermediate.input_dims       = hidden_units_;
    ffn_weights.intermediate.output_dims      = inter_size_ / tensor_para_size_;
    ffn_weights.intermediate.type             = weight_type;
    ffn_weights.intermediate.group_size       = group_size;
    ffn_weights.intermediate.w4_weight_layout = w4_weight_layout;
    ffn_weights.intermediate.w4_pad_size      = w4_pad_size;

    ffn_weights.fused_gating_intermediate.input_dims       = hidden_units_;
    ffn_weights.fused_gating_intermediate.output_dims      = inter_size_ / tensor_para_size_ * 2;
    ffn_weights.fused_gating_intermediate.type             = weight_type;
    ffn_weights.fused_gating_intermediate.group_size       = group_size;
    ffn_weights.fused_gating_intermediate.w4_weight_layout = w4_weight_layout;
    ffn_weights.fused_gating_intermediate.w4_pad_size      = w4_pad_size;

    ffn_weights.output.input_dims       = inter_size_ / tensor_para_size_;
    ffn_weights.output.output_dims      = hidden_units_;
    ffn_weights.output.type             = weight_type;
    ffn_weights.output.group_size       = group_size;
    ffn_weights.output.w4_weight_layout = w4_weight_layout;
    ffn_weights.output.w4_pad_size      = w4_pad_size;

    mallocWeights();
}
...
...
@@ -118,16 +125,9 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
    else {  // int8, int4
        const int factor = sizeof(float) * 8 / bit_size;
        FT_CHECK(weights.input_dims % factor == 0);
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
        if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-           size_t new_input_dims = weights.input_dims + 2 * weights.group_size;
+           size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
            deviceMalloc((int**)&weights.kernel, new_input_dims * weights.output_dims / factor);
            deviceMemSetZero((int*)weights.kernel, new_input_dims * weights.output_dims / factor);
...
...
@@ -171,15 +171,10 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
    }
    else {  // int8, int4
        const int factor = sizeof(float) * 8 / bit_size;
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
        if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-           size_t new_input_dims = weights.input_dims + weights.group_size;
+           size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
            output.insert(get_name("qweight"),
                          Tensor{MEMORY_GPU,
...
...
@@ -189,7 +184,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
            output.insert(get_name("scales_zeros"),
                          Tensor{MEMORY_GPU,
                                 getTensorType<T>(),
-                                {new_input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
+                                {new_input_dims * weights.output_dims / weights.group_size * 2 * sizeof(T)},
                                 weights.scales_and_zeros});
        }
        else {
...
...
@@ -307,23 +302,36 @@ void loadWeights(LlamaDenseWeight<T>& w,
        FT_CHECK(dim1 % factor == 0);
        // // read the environment variable
        // int m_weightlayout_switch = 1;
        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
        // if (env_weightlayout_str != nullptr) {
        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
        // }
-       if ((dim0 % 4096 == 0) && (w.w4_weight_layout == 1 || w.w4_weight_layout == 2))
+       if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposition
        {
-           size_t new_dim0 = dim0 + 2 * w.group_size;
-           std::vector<size_t> w_shape{new_dim0, dim1 / factor * sizeof(uint32_t)};
-           loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
-           const size_t group_count = w.group_size > 0 ? new_dim0 / w.group_size : 1;
-           loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+           size_t new_dim0 = dim0;
+           if (dim0 % 4096 == 0)
+               new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+           // allocate workspace memory
+           int*  kernel_workspace = nullptr;
+           half* scales_workspace = nullptr;
+           deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+           deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+           deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+           // load the weight
+           std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+           loadWeightFromBin((int8_t*)kernel_workspace, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
+           const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+           loadWeightFromBin((half*)scales_workspace, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+           // transpose
+           reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+           reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+           // free the workspace
+           cudaFree(kernel_workspace);
+           cudaFree(scales_workspace);
+           kernel_workspace = nullptr;
+           scales_workspace = nullptr;
        }
        else {
            std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
            loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
...
...
@@ -332,9 +340,57 @@ void loadWeights(LlamaDenseWeight<T>& w,
            loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
        }
        // the weight is padded and transposed here
    }
}

template<typename T>
void transWeights(LlamaDenseWeight<T>& w, FtCudaDataType model_file_type)
{
    const auto   type     = model_file_type;
    size_t       dim0     = w.input_dims;
    size_t       dim1     = w.output_dims;
    const size_t bit_size = getBitSize(w.type);
    const int    factor   = sizeof(float) * 8 / bit_size;
    FT_CHECK(dim1 % factor == 0);
    if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposition
    {
        size_t new_dim0 = dim0;
        if (dim0 % 4096 == 0)
            new_dim0 = dim0 + w.w4_pad_size * w.group_size;
        // allocate workspace memory
        int*  kernel_workspace = nullptr;
        half* scales_workspace = nullptr;
        deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
        deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
        deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
        deviceMemSetZero((half*)scales_workspace, new_dim0 / w.group_size * dim1 * 2);
        // copy the already loaded weight into the workspace
        cudaD2Dcpy((int*)kernel_workspace, (int*)w.kernel, dim0 * dim1 / factor);
        cudaD2Dcpy((half*)scales_workspace, (half*)w.scales_and_zeros, dim0 / w.group_size * dim1 * 2);
        // transpose
        reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
        reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
        // free the workspace
        cudaFree(kernel_workspace);
        cudaFree(scales_workspace);
        kernel_workspace = nullptr;
        scales_workspace = nullptr;
    }
}

template<typename T>
void LlamaDecoderLayerWeight<T>::mallocWeights()
{
...
...
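Both `loadWeights` and the new `transWeights` share the same size arithmetic: when the input dimension is a multiple of 4096 and a transposed layout (`w4_weight_layout` of 1 or 2) is selected, the K dimension grows to `new_dim0 = dim0 + w4_pad_size * group_size`, and the qweight and scales_zeros buffers are sized accordingly. A small bookkeeping sketch (pure Python, element counts only; the function name is illustrative):

    def padded_w4_sizes(dim0, dim1, group_size, w4_pad_size, bits=4):
        """Return (packed qweight words, scales_zeros half elements) for the padded layout."""
        factor   = 32 // bits                              # int4 values packed per uint32
        new_dim0 = dim0 + w4_pad_size * group_size if dim0 % 4096 == 0 else dim0
        qweight_words = new_dim0 * dim1 // factor          # matches deviceMalloc for w.kernel
        scales_zeros  = new_dim0 // group_size * dim1 * 2  # scale + zero per group per output column
        return qweight_words, scales_zeros

    # e.g. a 4096 x 4096 projection with group_size=128 and w4_pad_size=2
    print(padded_w4_sizes(4096, 4096, 128, 2))   # (2228224, 278528)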
@@ -420,6 +476,19 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
    }
}

template<typename T>
void LlamaDecoderLayerWeight<T>::modifyModel(FtCudaDataType model_file_type)
{
    const auto rank_spec = std::to_string(tensor_para_rank_);
    const auto type      = model_file_type;
    transWeights(self_attn_weights.qkv, type);
    transWeights(self_attn_weights.output, type);
    transWeights(ffn_weights.fused_gating_intermediate, type);
    transWeights(ffn_weights.output, type);
}

template<typename T>
TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
{
...
...
src/turbomind/models/llama/LlamaDecoderLayerWeight.h  View file @ 14ad512a
...
...
@@ -36,6 +36,7 @@ public:
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            int        w4_pad_size,
                            bool       attn_bias,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank);
...
...
@@ -44,6 +45,7 @@ public:
    LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

    void loadModel(std::string dir_path, FtCudaDataType model_file_type);

    void modifyModel(FtCudaDataType model_file_type);

    TensorMap getParams(std::string prefix);
...
...
src/turbomind/models/llama/LlamaDenseWeight.h  View file @ 14ad512a
...
...
@@ -64,6 +64,7 @@ struct LlamaDenseWeight {
    T*  scales_and_zeros;
    int group_size;
    int w4_weight_layout;
    int w4_pad_size;
};

template<typename T>
...
...
src/turbomind/models/llama/LlamaLinear.h  View file @ 14ad512a
...
...
@@ -117,19 +117,19 @@ private:
        // check whether the x-padding workspace is large enough
        if (weight.input_dims % 4096 == 0)  // padding is required
        {
-           int pad_group_count = 2;
-           input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                         batch_size, weight.input_dims, weight.group_size, pad_group_count);
-           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                    weight.input_dims + pad_group_count * weight.group_size, weight.output_dims,
-                                    weight.group_size);
+           input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                         batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                    weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims,
+                                    weight.group_size);
            cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                  CUBLAS_OP_N,
                                  weight.output_dims,                                                      // m
                                  batch_size,                                                              // n
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),    // []
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)cublas_wrapper_->xpading_workspace_,
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  output_data,
                                  weight.output_dims);                                                     // m
        }
...
...
@@ -155,8 +155,7 @@ private:
        // check whether the ck workspace is large enough
        if (weight.input_dims % 4096 == 0) {
-           int pad_groupcount = 2;
-           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
-                                reinterpret_cast<const void*>(weight.kernel),
-                                reinterpret_cast<const void*>(weight.scales_and_zeros),
-                                reinterpret_cast<void*>(output_data),
-                                batch_size, weight.output_dims,
-                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                (weight.input_dims + pad_groupcount * weight.group_size),
-                                weight.output_dims, weight.group_size,
-                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
-                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
+                                reinterpret_cast<const void*>(weight.kernel),
+                                reinterpret_cast<const void*>(weight.scales_and_zeros),
+                                reinterpret_cast<void*>(output_data),
+                                batch_size, weight.output_dims,
+                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                (weight.input_dims + weight.w4_pad_size * weight.group_size),
+                                weight.output_dims, weight.group_size,
+                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
+                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
        }
        // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
        else {
...
...
@@ -208,19 +207,19 @@ private:
        // check whether the x-padding workspace is large enough
        if (weight.input_dims % 4096 == 0)  // padding is required
        {
-           int pad_group_count = 2;
-           input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                            batch_size, weight.input_dims, weight.group_size, pad_group_count);
-           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                    weight.input_dims + pad_group_count * weight.group_size, weight.output_dims,
-                                    weight.group_size);
+           input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                            batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+           dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                    (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                    weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims,
+                                    weight.group_size);
            cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                  CUBLAS_OP_N,
                                  weight.output_dims,                                                      // m
                                  batch_size,                                                              // n
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),    // []
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  (const T*)cublas_wrapper_->xpading_workspace_,
-                                 weight.input_dims + pad_group_count * weight.group_size,                 // k
+                                 weight.input_dims + weight.w4_pad_size * weight.group_size,              // k
                                  output_tmp,
                                  weight.output_dims);                                                     // m
        }
...
...
@@ -246,8 +245,8 @@ private:
        if (weight.input_dims % 4096 == 0) {
-           int pad_groupcount = 2;
-           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
-                                reinterpret_cast<const void*>(weight.kernel),
-                                reinterpret_cast<const void*>(weight.scales_and_zeros),
-                                reinterpret_cast<void*>(output_tmp),
-                                batch_size, weight.output_dims,
-                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                (weight.input_dims + pad_groupcount * weight.group_size),
-                                weight.output_dims, weight.group_size,
-                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
-                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+           run_weight_only_gemm(reinterpret_cast<const void*>(input_data),
+                                reinterpret_cast<const void*>(weight.kernel),
+                                reinterpret_cast<const void*>(weight.scales_and_zeros),
+                                reinterpret_cast<void*>(output_tmp),
+                                batch_size, weight.output_dims,
+                                (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                (weight.input_dims + weight.w4_pad_size * weight.group_size),
+                                weight.output_dims, weight.group_size,
+                                reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),
+                                CK_WORKSPACE_SIZE, (hipStream_t)stream_);
        }
        // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
        else {
...
...
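The padded GEMM paths above never change the mathematical result: the activation is zero-extended from K = input_dims to K' = input_dims + w4_pad_size * group_size, and the dequantized-weight workspace is allocated and zero-initialized at the same K', so the extra rows contribute nothing to the product. A short NumPy check of that invariant (shapes are illustrative):

    import numpy as np

    m, k, group_size, w4_pad_size = 4, 4096, 128, 2
    k_pad = k + w4_pad_size * group_size            # the padded K used by the GEMM calls above

    x = np.random.rand(m, k)                        # activation
    w = np.random.rand(k, 8)                        # stands in for the dequantized weight

    x_pad = np.zeros((m, k_pad)); x_pad[:, :k] = x
    w_pad = np.zeros((k_pad, 8)); w_pad[:k, :] = w

    assert np.allclose(x @ w, x_pad @ w_pad)        # zero padding leaves the output unchanged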
src/turbomind/models/llama/LlamaWeight.cc  View file @ 14ad512a
...
...
@@ -33,6 +33,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            int        w4_pad_size,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank):
    hidden_units_(head_num * size_per_head),
...
...
@@ -57,6 +58,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                                                                     weight_type_,
                                                                     group_size,
                                                                     w4_weight_layout,
                                                                     w4_pad_size,
                                                                     attn_bias,
                                                                     tensor_para_size_,
                                                                     tensor_para_rank_));
...
...
@@ -69,7 +71,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
        std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
        const char* env_value            = str_w4_weight_layout.c_str();
        setenv(env_name, env_value, 1);
        // printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n", w4_weight_layout);
    }
    else {
...
...
@@ -128,8 +130,23 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
    }
}

template<typename T>
void LlamaWeight<T>::modifyModel()
{
    FtCudaDataType model_file_type = FtCudaDataType::FP16;
    if (weight_type_ == WeightType::kBF16) {
        model_file_type = FtCudaDataType::BF16;
    }
    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->modifyModel(model_file_type);
    }
}

template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
...
...
src/turbomind/models/llama/LlamaWeight.h  View file @ 14ad512a
...
...
@@ -38,6 +38,7 @@ struct LlamaWeight {
                WeightType weight_type,
                int        group_size,
                int        w4_weight_layout,
                int        w4_pad_size,
                size_t     tensor_para_size,
                size_t     tensor_para_rank);
...
...
@@ -47,6 +48,7 @@ struct LlamaWeight {
    LlamaWeight& operator=(const LlamaWeight& other) = delete;

    void loadModel(std::string dir_path);

    void modifyModel();

    TensorMap getParams();
...
...
src/turbomind/python/bind.cpp  View file @ 14ad512a
...
...
@@ -439,6 +439,11 @@ PYBIND11_MODULE(_turbomind, m)
             py::call_guard<py::gil_scoped_release>(),
             "device_id"_a,
             "rank"_a)
        .def("modify_shared_weights",
             &AbstractTransformerModel::modifySharedWeights,
             py::call_guard<py::gil_scoped_release>(),
             "device_id"_a,
             "rank"_a)
        .def("get_params", [](AbstractTransformerModel* model, int deviceId, int rank) {
...
...
src/turbomind/triton_backend/llama/LlamaTritonModel.cc  View file @ 14ad512a
...
...
@@ -187,6 +187,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
    quant_policy_     = reader.GetInteger("llama", "quant_policy", 0);
    group_size_       = reader.GetInteger("llama", "group_size", 0);
    w4_weight_layout_ = reader.GetInteger("llama", "w4_weight_layout", 2);
    w4_pad_size_      = reader.GetInteger("llama", "w4_pad_size", 2);

    // rotary embedding parameters
    attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
...
...
@@ -383,6 +384,7 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                          weight_type_,
                                                          group_size_,
                                                          w4_weight_layout_,
                                                          w4_pad_size_,
                                                          tensor_para_size_,
                                                          tensor_para_rank);
    // model inited with model_dir
...
...
@@ -392,6 +394,21 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
    return;
}

template<typename T>
void LlamaTritonModel<T>::modifySharedWeights(int device_id, int rank)
{
    ft::check_cuda_error(cudaSetDevice(device_id));
    const int tensor_para_rank   = rank % tensor_para_size_;
    const int pipeline_para_rank = rank / tensor_para_size_;
    ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
    if (weight_type_ == turbomind::WeightType::kINT4) {
        shared_weights_[device_id]->modifyModel();
    }
    return;
}

template<typename T>
TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
{
...
...
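`w4_weight_layout` and `w4_pad_size` are read from the `[llama]` section of the TurboMind config with defaults of 2, alongside the existing `group_size` and `quant_policy` keys. A hedged sketch of what the corresponding entries in a generated config might look like, parsed here with Python's standard configparser (the key names come from the `GetInteger` calls above; the surrounding values are illustrative):

    import configparser

    ini_text = "[llama]\nquant_policy = 0\ngroup_size = 128\nw4_weight_layout = 2\nw4_pad_size = 2\n"
    cfg = configparser.ConfigParser()
    cfg.read_string(ini_text)
    assert cfg.getint("llama", "w4_weight_layout") == 2
    assert cfg.getint("llama", "w4_pad_size") == 2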
src/turbomind/triton_backend/llama/LlamaTritonModel.h  View file @ 14ad512a
...
...
@@ -53,6 +53,7 @@ struct LlamaTritonModel: public AbstractTransformerModel {
                       std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;

    void createSharedWeights(int deviceId, int rank) override;

    void modifySharedWeights(int deviceId, int rank) override;

    TensorMap getParams(int deviceId, int rank) override;
...
...
@@ -102,6 +103,7 @@ private:
    int quant_policy_;
    int group_size_;
    int w4_weight_layout_;
    int w4_pad_size_;

    // shared weights for each device
    std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
...
...
src/turbomind/triton_backend/transformer_triton_backend.hpp  View file @ 14ad512a
...
...
@@ -325,6 +325,7 @@ struct AbstractTransformerModel {
                         std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;

    virtual void createSharedWeights(int deviceId, int rank) = 0;

    virtual void modifySharedWeights(int deviceId, int rank) = 0;

    virtual TensorMap getParams(int deviceId, int rank) = 0;
...
...