Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
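The hunks below apply the rename mechanically: source paths move from src/fastertransformer to src/turbomind, the C++ namespace becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*. A minimal, self-contained sketch of that pattern follows; the TM_LOG_DEBUG definition here is a stand-in for illustration and is not lmdeploy's actual logger.

```cpp
#include <cstdio>

// Placeholder for the renamed debug-logging macro (previously FT_LOG_DEBUG);
// the real macro lives in src/turbomind/utils/logger.h.
#define TM_LOG_DEBUG(fmt, ...) std::printf("[TM][DEBUG] " fmt "\n", __VA_ARGS__)

// `namespace fastertransformer { ... }` becomes:
namespace turbomind {
inline void hello() { TM_LOG_DEBUG("running inside namespace %s", "turbomind"); }
}  // namespace turbomind

// Call sites keep compiling through a short alias, as the triton backend hunk
// below does with `namespace ft = turbomind;`.
namespace ft = turbomind;

int main() { ft::hello(); return 0; }
```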
......@@ -3,7 +3,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
args: ["--exclude=llama_service/fastertransformer/triton_model/llama_models/*, configs/*"]
args: ["--exclude=lmdeploy/turbomind/triton_models/*"]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
......
......@@ -53,7 +53,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
......
......@@ -98,7 +98,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
<summary><b>7B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -128,7 +128,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -144,7 +144,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......
......@@ -96,7 +96,7 @@ make -j$(nproc) && make install
<summary><b>7B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -107,7 +107,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -126,7 +126,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -142,7 +142,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......
......@@ -21,16 +21,16 @@
#include <memory>
#include <thread>
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/fastertransformer/utils/word_list.h"
namespace ft = fastertransformer;
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/mpi_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include "src/turbomind/utils/word_list.h"
namespace ft = turbomind;
constexpr const bool kUSE_MPI = true;
......
......@@ -27,7 +27,7 @@ if __name__ == '__main__':
author='OpenMMLab',
author_email='openmmlab@gmail.com',
packages=find_packages(
exclude=('lmdeploy/serve/fastertransformer/triton_models', )),
exclude=('lmdeploy/serve/turbomind/triton_models', )),
classifiers=[
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
......
......@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
namespace turbomind {
/* Gelu Activation */
......@@ -255,8 +255,8 @@ void invokeGenericActivation(T* out,
const int seq_len,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
using PT = typename packed_type<T>::type;
constexpr int packed_elems = num_elems<PT>::value;
using PBT = typename packed_as<BT, packed_elems>::type;
......@@ -272,7 +272,7 @@ void invokeGenericActivation(T* out,
block.x = n_threads;
grid.x = ceil(m * n / double(n_threads));
}
FT_LOG_DEBUG("%d %d", grid.x, block.x);
TM_LOG_DEBUG("%d %d", grid.x, block.x);
sync_check_cuda_error();
generic_activation<Activation><<<grid, block, 0, stream>>>(reinterpret_cast<PT*>(out),
reinterpret_cast<const PBT*>(bias),
......@@ -655,4 +655,4 @@ void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stre
template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
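The second hunk of activation_kernels.cu above sets block.x to n_threads and rounds m * n elements up to whole blocks. A tiny host-side check of that arithmetic with assumed sizes (the values are hypothetical):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int m = 3, n = 11008, n_threads = 512;  // assumed example sizes
    const int grid_x = static_cast<int>(std::ceil(m * n / double(n_threads)));
    std::printf("grid.x = %d, block.x = %d\n", grid_x, n_threads);  // grid.x = 65
    return 0;
}
```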
......@@ -16,12 +16,12 @@
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>
namespace fastertransformer {
namespace turbomind {
// clang-format off
template<typename T> struct GeluActivation;
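GeluActivation is only forward-declared in the header hunk above. For orientation, the tanh approximation commonly used for GELU looks like this; the kernel's actual definition is not part of this diff:

```cpp
#include <cmath>
#include <cstdio>

float gelu_tanh(float x) {
    const float c = 0.7978845608f;  // sqrt(2 / pi)
    return 0.5f * x * (1.0f + std::tanh(c * (x + 0.044715f * x * x * x)));
}

int main() {
    std::printf("gelu(1.0) ~= %f\n", gelu_tanh(1.0f));  // ~0.8412
    return 0;
}
```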
......@@ -107,4 +107,4 @@ void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStre
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,10 +14,10 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__global__ void ban_bad_words(T* logits,
......@@ -161,4 +161,4 @@ template void invokeBanBadWords(float* logits,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -19,7 +19,7 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
void invokeBanBadWords(T* logits,
......@@ -36,4 +36,4 @@ void invokeBanBadWords(T* logits,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
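The ban_bad_words hunks above only rename includes and the namespace. As background, a bad-words ban of this kind typically works as sketched below: when the generated suffix matches all but the last token of a banned sequence, the logit of that final token is forced to negative infinity. This host-side sketch is illustrative, not the kernel's code.

```cpp
#include <cstddef>
#include <limits>
#include <vector>

void ban_bad_words_host(std::vector<float>& logits,              // [vocab_size]
                        const std::vector<int>& output_ids,      // tokens generated so far
                        const std::vector<std::vector<int>>& bad_words) {
    for (const auto& bad : bad_words) {
        if (bad.empty()) continue;
        const size_t prefix = bad.size() - 1;
        if (output_ids.size() < prefix) continue;
        bool match = true;
        for (size_t i = 0; i < prefix; ++i) {
            if (output_ids[output_ids.size() - prefix + i] != bad[i]) { match = false; break; }
        }
        // The suffix matches, so the completing token can never be sampled.
        if (match) logits[bad.back()] = -std::numeric_limits<float>::infinity();
    }
}
```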
......@@ -16,10 +16,10 @@
#include <assert.h>
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__global__ void add_bias_temperature(T* logits,
......@@ -310,4 +310,4 @@ template void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,10 +17,10 @@
#include <cuda_fp16.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
......@@ -45,4 +45,4 @@ void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
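The penalty hunks above rename the files around add_bias_temperature and invokeAddBiasApplyPenalties. A host-side sketch of the standard formulas such a fused kernel usually combines (the kernel's exact behavior is not shown in this diff):

```cpp
#include <cstddef>
#include <vector>

void apply_penalties_host(std::vector<float>& logits,
                          const std::vector<float>& bias,         // same size as logits
                          const std::vector<int>& previous_tokens,
                          float temperature,
                          float repetition_penalty) {
    // Add the bias, then scale by temperature (what add_bias_temperature suggests).
    for (size_t i = 0; i < logits.size(); ++i)
        logits[i] = (logits[i] + bias[i]) / temperature;

    // Classic CTRL-style repetition penalty on tokens that already appeared.
    for (int tok : previous_tokens) {
        float& l = logits[tok];
        l = (l > 0.f) ? l / repetition_penalty : l * repetition_penalty;
    }
}
```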
......@@ -22,13 +22,13 @@
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
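The hunk above ends at the signature of apply_length_penalty. One common way such a score is normalized, shown as a sketch rather than the device function's actual body:

```cpp
#include <cmath>
#include <cstdio>

float apply_length_penalty_host(float log_prob, int length, float length_penalty) {
    if (length_penalty == 0.0f || length == 1)
        return log_prob;                                  // no normalization
    return log_prob / std::pow(static_cast<float>(length), length_penalty);
}

int main() {
    // A cumulative log-prob of -6.0 over 8 tokens with penalty 1.0 gives -0.75.
    std::printf("%f\n", apply_length_penalty_host(-6.0f, 8, 1.0f));
    return 0;
}
```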
......@@ -595,7 +595,7 @@ void invokeTopkBeamSearch(void* workspace,
const int* end_ids,
cudaStream_t stream)
{
FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
// log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
const int vocab_size = vocab_size_padded_;
// Beam size should be less than or equal to vocab size.
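The comments above describe cumulative log-probs laid out as (batch, beam, vocab). A small host-side illustration of one top-k step over that layout for a single batch entry; the kernel does the same with a batched GPU top-k:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int beam = 2, vocab = 3;
    // cum_log_probs[b * vocab + v] = score of extending beam b with token v
    std::vector<float> cum_log_probs = {-1.0f, -2.5f, -0.7f,   // beam 0
                                        -1.2f, -0.9f, -3.0f};  // beam 1
    std::vector<int> idx(beam * vocab);
    for (int i = 0; i < beam * vocab; ++i) idx[i] = i;
    std::partial_sort(idx.begin(), idx.begin() + beam, idx.end(),
                      [&](int a, int b) { return cum_log_probs[a] > cum_log_probs[b]; });
    for (int k = 0; k < beam; ++k)  // survivors: (beam 0, tok 2) and (beam 1, tok 1)
        std::printf("keep beam %d token %d score %.1f\n",
                    idx[k] / vocab, idx[k] % vocab, cum_log_probs[idx[k]]);
    return 0;
}
```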
......@@ -842,4 +842,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -18,7 +18,7 @@
#pragma once
namespace fastertransformer {
namespace turbomind {
// In original beam search implementation, if a beam is finished, we set it as finished
// and only continue to do beam search on remain beams (namely, beam_width - 1 beams in next step)
......@@ -91,4 +91,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const int beam_width,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,11 +15,11 @@
*/
#include "bert_preprocess_kernels.h"
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
namespace fastertransformer {
namespace turbomind {
__global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num,
int* tmp_mask_offset,
......@@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_M
#endif
} // namespace fastertransformer
} // namespace turbomind
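The first hunk of this file declares getPaddingOffsetAndCuSeqLensKernel, which produces the padding offsets and cumulative sequence lengths used to pack variable-length batches. A host-side illustration with made-up lengths (the kernel's exact output layout is not shown in this hunk):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const std::vector<int> seq_lens = {2, 3, 1};          // assumed batch of 3
    std::vector<int> cu_seqlens = {0};                    // prefix sums of lengths
    for (int len : seq_lens) cu_seqlens.push_back(cu_seqlens.back() + len);

    // With max_seq_len = 3, token i of the packed (un-padded) tensor sits at
    // padded index i + padding_offset[i].
    const int max_seq_len = 3;
    std::vector<int> padding_offset;
    int offset = 0;
    for (size_t b = 0; b < seq_lens.size(); ++b) {
        for (int t = 0; t < seq_lens[b]; ++t) padding_offset.push_back(offset);
        offset += max_seq_len - seq_lens[b];
    }
    for (int v : cu_seqlens) std::printf("%d ", v);       // 0 2 5 6
    std::printf("\n");
    for (int v : padding_offset) std::printf("%d ", v);   // 0 0 1 1 1 1
    std::printf("\n");
    return 0;
}
```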
......@@ -15,15 +15,15 @@
*/
#pragma once
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifdef ENABLE_FP8
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#endif // ENABLE_FP8
namespace fastertransformer {
namespace turbomind {
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
size_t* h_token_num,
......@@ -111,4 +111,4 @@ template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,9 +15,9 @@
*/
#include "custom_ar_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
namespace fastertransformer {
namespace turbomind {
////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -395,4 +395,4 @@ template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<_
cudaStream_t stream);
#endif
template void invokeOneOrTwoShotAllReduceKernel<uint32_t>(AllReduceParams<uint32_t>& param, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -21,7 +21,7 @@
#include <iostream>
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#define CUSTOM_AR_SIZE_THRESHOLD 50331648
#define MAX_ALL_REDUCE_BLOCKS 24
......@@ -31,7 +31,7 @@
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
namespace fastertransformer {
namespace turbomind {
#ifdef ENABLE_BF16
typedef struct bf168 {
......@@ -60,4 +60,4 @@ void invokeOneOrTwoShotAllReduceKernel(AllReduceParams<T>& param, cudaStream_t s
void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo);
} // namespace fastertransformer
} // namespace turbomind
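The header above defines size thresholds for the custom all-reduce. The dispatch below is a hypothetical reading of those constants, not code from the repository: messages larger than CUSTOM_AR_SIZE_THRESHOLD presumably fall back to NCCL, and DEFALUT_ALGO_AR_SIZE_THRESHOLD presumably separates the one-shot from the two-shot kernel.

```cpp
#include <cstddef>
#include <cstdio>

enum class ArAlgo { OneShot, TwoShot, FallbackNccl };

// Hypothetical dispatch based on the thresholds defined in custom_ar_kernels.h.
ArAlgo pick_all_reduce_algo(size_t message_bytes) {
    constexpr size_t kCustomArSizeThreshold = 50331648;  // CUSTOM_AR_SIZE_THRESHOLD (48 MiB)
    constexpr size_t kAlgoArSizeThreshold   = 393216;    // DEFALUT_ALGO_AR_SIZE_THRESHOLD (384 KiB)
    if (message_bytes > kCustomArSizeThreshold)
        return ArAlgo::FallbackNccl;                      // assumed: too large for custom AR
    return message_bytes <= kAlgoArSizeThreshold ? ArAlgo::OneShot : ArAlgo::TwoShot;
}

int main() {
    std::printf("256 KiB -> %d\n", static_cast<int>(pick_all_reduce_algo(256 * 1024)));  // OneShot (0)
    return 0;
}
```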
......@@ -14,10 +14,10 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <float.h>
#include <type_traits>
......
......@@ -16,9 +16,9 @@
#pragma once
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
......