Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
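The hunks below apply the rename mechanically: source paths move from src/fastertransformer to src/turbomind, the C++ namespace becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*. A minimal, self-contained sketch of that pattern follows; the TM_LOG_DEBUG definition here is a stand-in for illustration and is not lmdeploy's actual logger.

```cpp
#include <cstdio>

// Placeholder for the renamed debug-logging macro (previously FT_LOG_DEBUG);
// the real macro lives in src/turbomind/utils/logger.h.
#define TM_LOG_DEBUG(fmt, ...) std::printf("[TM][DEBUG] " fmt "\n", __VA_ARGS__)

// `namespace fastertransformer { ... }` becomes:
namespace turbomind {
inline void hello() { TM_LOG_DEBUG("running inside namespace %s", "turbomind"); }
}  // namespace turbomind

// Call sites keep compiling through a short alias, as the triton backend hunk
// below does with `namespace ft = turbomind;`.
namespace ft = turbomind;

int main() { ft::hello(); return 0; }
```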
......@@ -3,7 +3,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
args: ["--exclude=llama_service/fastertransformer/triton_model/llama_models/*, configs/*"]
args: ["--exclude=lmdeploy/turbomind/triton_models/*"]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
......
......@@ -53,7 +53,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
......
......@@ -98,7 +98,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
<summary><b>7B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -128,7 +128,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -144,7 +144,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......
......@@ -96,7 +96,7 @@ make -j$(nproc) && make install
<summary><b>7B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -107,7 +107,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>
```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -126,7 +126,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......@@ -142,7 +142,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
......
......@@ -21,16 +21,16 @@
#include <memory>
#include <thread>
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/fastertransformer/utils/word_list.h"
namespace ft = fastertransformer;
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/mpi_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include "src/turbomind/utils/word_list.h"
namespace ft = turbomind;
constexpr const bool kUSE_MPI = true;
......
......@@ -27,7 +27,7 @@ if __name__ == '__main__':
author='OpenMMLab',
author_email='openmmlab@gmail.com',
packages=find_packages(
exclude=('lmdeploy/serve/fastertransformer/triton_models', )),
exclude=('lmdeploy/serve/turbomind/triton_models', )),
classifiers=[
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
......
......@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
namespace turbomind {
/* Gelu Activation */
......@@ -255,8 +255,8 @@ void invokeGenericActivation(T* out,
const int seq_len,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
using PT = typename packed_type<T>::type;
constexpr int packed_elems = num_elems<PT>::value;
using PBT = typename packed_as<BT, packed_elems>::type;
......@@ -272,7 +272,7 @@ void invokeGenericActivation(T* out,
block.x = n_threads;
grid.x = ceil(m * n / double(n_threads));
}
FT_LOG_DEBUG("%d %d", grid.x, block.x);
TM_LOG_DEBUG("%d %d", grid.x, block.x);
sync_check_cuda_error();
generic_activation<Activation><<<grid, block, 0, stream>>>(reinterpret_cast<PT*>(out),
reinterpret_cast<const PBT*>(bias),
......@@ -655,4 +655,4 @@ void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stre
template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
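The second hunk of activation_kernels.cu above sets block.x to n_threads and rounds m * n elements up to whole blocks. A tiny host-side check of that arithmetic with assumed sizes (the values are hypothetical):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int m = 3, n = 11008, n_threads = 512;  // assumed example sizes
    const int grid_x = static_cast<int>(std::ceil(m * n / double(n_threads)));
    std::printf("grid.x = %d, block.x = %d\n", grid_x, n_threads);  // grid.x = 65
    return 0;
}
```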
......@@ -16,12 +16,12 @@
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>
namespace fastertransformer {
namespace turbomind {
// clang-format off
template<typename T> struct GeluActivation;
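GeluActivation is only forward-declared in the header hunk above. For orientation, the tanh approximation commonly used for GELU looks like this; the kernel's actual definition is not part of this diff:

```cpp
#include <cmath>
#include <cstdio>

float gelu_tanh(float x) {
    const float c = 0.7978845608f;  // sqrt(2 / pi)
    return 0.5f * x * (1.0f + std::tanh(c * (x + 0.044715f * x * x * x)));
}

int main() {
    std::printf("gelu(1.0) ~= %f\n", gelu_tanh(1.0f));  // ~0.8412
    return 0;
}
```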
......@@ -107,4 +107,4 @@ void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStre
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,10 +14,10 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__global__ void ban_bad_words(T* logits,
......@@ -161,4 +161,4 @@ template void invokeBanBadWords(float* logits,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -19,7 +19,7 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
void invokeBanBadWords(T* logits,
......@@ -36,4 +36,4 @@ void invokeBanBadWords(T* logits,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
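The ban_bad_words hunks above only rename includes and the namespace. As background, a bad-words ban of this kind typically works as sketched below: when the generated suffix matches all but the last token of a banned sequence, the logit of that final token is forced to negative infinity. This host-side sketch is illustrative, not the kernel's code.

```cpp
#include <cstddef>
#include <limits>
#include <vector>

void ban_bad_words_host(std::vector<float>& logits,              // [vocab_size]
                        const std::vector<int>& output_ids,      // tokens generated so far
                        const std::vector<std::vector<int>>& bad_words) {
    for (const auto& bad : bad_words) {
        if (bad.empty()) continue;
        const size_t prefix = bad.size() - 1;
        if (output_ids.size() < prefix) continue;
        bool match = true;
        for (size_t i = 0; i < prefix; ++i) {
            if (output_ids[output_ids.size() - prefix + i] != bad[i]) { match = false; break; }
        }
        // The suffix matches, so the completing token can never be sampled.
        if (match) logits[bad.back()] = -std::numeric_limits<float>::infinity();
    }
}
```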
......@@ -16,10 +16,10 @@
#include <assert.h>
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__global__ void add_bias_temperature(T* logits,
......@@ -310,4 +310,4 @@ template void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,10 +17,10 @@
#include <cuda_fp16.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
......@@ -45,4 +45,4 @@ void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
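The penalty hunks above rename the files around add_bias_temperature and invokeAddBiasApplyPenalties. A host-side sketch of the standard formulas such a fused kernel usually combines (the kernel's exact behavior is not shown in this diff):

```cpp
#include <cstddef>
#include <vector>

void apply_penalties_host(std::vector<float>& logits,
                          const std::vector<float>& bias,         // same size as logits
                          const std::vector<int>& previous_tokens,
                          float temperature,
                          float repetition_penalty) {
    // Add the bias, then scale by temperature (what add_bias_temperature suggests).
    for (size_t i = 0; i < logits.size(); ++i)
        logits[i] = (logits[i] + bias[i]) / temperature;

    // Classic CTRL-style repetition penalty on tokens that already appeared.
    for (int tok : previous_tokens) {
        float& l = logits[tok];
        l = (l > 0.f) ? l / repetition_penalty : l * repetition_penalty;
    }
}
```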
......@@ -22,13 +22,13 @@
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
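The hunk above ends at the signature of apply_length_penalty. One common way such a score is normalized, shown as a sketch rather than the device function's actual body:

```cpp
#include <cmath>
#include <cstdio>

float apply_length_penalty_host(float log_prob, int length, float length_penalty) {
    if (length_penalty == 0.0f || length == 1)
        return log_prob;                                  // no normalization
    return log_prob / std::pow(static_cast<float>(length), length_penalty);
}

int main() {
    // A cumulative log-prob of -6.0 over 8 tokens with penalty 1.0 gives -0.75.
    std::printf("%f\n", apply_length_penalty_host(-6.0f, 8, 1.0f));
    return 0;
}
```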
......@@ -595,7 +595,7 @@ void invokeTopkBeamSearch(void* workspace,
const int* end_ids,
cudaStream_t stream)
{
FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
// log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
const int vocab_size = vocab_size_padded_;
// Beam size should be less than or equal to vocab size.
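The comments above describe cumulative log-probs laid out as (batch, beam, vocab). A small host-side illustration of one top-k step over that layout for a single batch entry; the kernel does the same with a batched GPU top-k:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int beam = 2, vocab = 3;
    // cum_log_probs[b * vocab + v] = score of extending beam b with token v
    std::vector<float> cum_log_probs = {-1.0f, -2.5f, -0.7f,   // beam 0
                                        -1.2f, -0.9f, -3.0f};  // beam 1
    std::vector<int> idx(beam * vocab);
    for (int i = 0; i < beam * vocab; ++i) idx[i] = i;
    std::partial_sort(idx.begin(), idx.begin() + beam, idx.end(),
                      [&](int a, int b) { return cum_log_probs[a] > cum_log_probs[b]; });
    for (int k = 0; k < beam; ++k)  // survivors: (beam 0, tok 2) and (beam 1, tok 1)
        std::printf("keep beam %d token %d score %.1f\n",
                    idx[k] / vocab, idx[k] % vocab, cum_log_probs[idx[k]]);
    return 0;
}
```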
......@@ -842,4 +842,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -18,7 +18,7 @@
#pragma once
namespace fastertransformer {
namespace turbomind {
// In original beam search implementation, if a beam is finished, we set it as finished
// and only continue to do beam search on remain beams (namely, beam_width - 1 beams in next step)
......@@ -91,4 +91,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const int beam_width,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,11 +15,11 @@
*/
#include "bert_preprocess_kernels.h"
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
namespace fastertransformer {
namespace turbomind {
__global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num,
int* tmp_mask_offset,
......@@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_M
#endif
} // namespace fastertransformer
} // namespace turbomind
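The first hunk of this file declares getPaddingOffsetAndCuSeqLensKernel, which produces the padding offsets and cumulative sequence lengths used to pack variable-length batches. A host-side illustration with made-up lengths (the kernel's exact output layout is not shown in this hunk):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const std::vector<int> seq_lens = {2, 3, 1};          // assumed batch of 3
    std::vector<int> cu_seqlens = {0};                    // prefix sums of lengths
    for (int len : seq_lens) cu_seqlens.push_back(cu_seqlens.back() + len);

    // With max_seq_len = 3, token i of the packed (un-padded) tensor sits at
    // padded index i + padding_offset[i].
    const int max_seq_len = 3;
    std::vector<int> padding_offset;
    int offset = 0;
    for (size_t b = 0; b < seq_lens.size(); ++b) {
        for (int t = 0; t < seq_lens[b]; ++t) padding_offset.push_back(offset);
        offset += max_seq_len - seq_lens[b];
    }
    for (int v : cu_seqlens) std::printf("%d ", v);       // 0 2 5 6
    std::printf("\n");
    for (int v : padding_offset) std::printf("%d ", v);   // 0 0 1 1 1 1
    std::printf("\n");
    return 0;
}
```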
......@@ -15,15 +15,15 @@
*/
#pragma once
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifdef ENABLE_FP8
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#endif // ENABLE_FP8
namespace fastertransformer {
namespace turbomind {
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
size_t* h_token_num,
......@@ -111,4 +111,4 @@ template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,9 +15,9 @@
*/
#include "custom_ar_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
namespace fastertransformer {
namespace turbomind {
////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -395,4 +395,4 @@ template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<_
cudaStream_t stream);
#endif
template void invokeOneOrTwoShotAllReduceKernel<uint32_t>(AllReduceParams<uint32_t>& param, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -21,7 +21,7 @@
#include <iostream>
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#define CUSTOM_AR_SIZE_THRESHOLD 50331648
#define MAX_ALL_REDUCE_BLOCKS 24
......@@ -31,7 +31,7 @@
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
namespace fastertransformer {
namespace turbomind {
#ifdef ENABLE_BF16
typedef struct bf168 {
......@@ -60,4 +60,4 @@ void invokeOneOrTwoShotAllReduceKernel(AllReduceParams<T>& param, cudaStream_t s
void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo);
} // namespace fastertransformer
} // namespace turbomind
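The header above defines size thresholds for the custom all-reduce. The dispatch below is a hypothetical reading of those constants, not code from the repository: messages larger than CUSTOM_AR_SIZE_THRESHOLD presumably fall back to NCCL, and DEFALUT_ALGO_AR_SIZE_THRESHOLD presumably separates the one-shot from the two-shot kernel.

```cpp
#include <cstddef>
#include <cstdio>

enum class ArAlgo { OneShot, TwoShot, FallbackNccl };

// Hypothetical dispatch based on the thresholds defined in custom_ar_kernels.h.
ArAlgo pick_all_reduce_algo(size_t message_bytes) {
    constexpr size_t kCustomArSizeThreshold = 50331648;  // CUSTOM_AR_SIZE_THRESHOLD (48 MiB)
    constexpr size_t kAlgoArSizeThreshold   = 393216;    // DEFALUT_ALGO_AR_SIZE_THRESHOLD (384 KiB)
    if (message_bytes > kCustomArSizeThreshold)
        return ArAlgo::FallbackNccl;                      // assumed: too large for custom AR
    return message_bytes <= kAlgoArSizeThreshold ? ArAlgo::OneShot : ArAlgo::TwoShot;
}

int main() {
    std::printf("256 KiB -> %d\n", static_cast<int>(pick_all_reduce_algo(256 * 1024)));  // OneShot (0)
    return 0;
}
```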
......@@ -14,10 +14,10 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <float.h>
#include <type_traits>
......
......@@ -16,9 +16,9 @@
#pragma once
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
......