Unverified commit 981a4610, authored by Li Zhang, committed by GitHub

[Fix] Remove unused code to reduce binary size (#181)

* clean-up

* fix lint

* fix lint
parent 83697422
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
    void allocateBuffer() override;
    void freeBuffer() override;
    void allocateBuffer(size_t token_num);

protected:
    const int fp8_mode_;
    T1*       inter_buf_      = nullptr;
    T2*       inter_buf_bf16_ = nullptr;
    size_t    inter_size_;

    virtual void invokeAddBiasActivation(const int    m,
                                         const T2*    bias,
                                         const float* input_scale,
                                         const float* input_scale_2,
                                         const float* input_scale_2_min,
                                         const float* output_scale) = 0;

public:
    FfnFP8Layer(size_t           inter_size,
                int              fp8_mode,
                cudaStream_t     stream,
                cublasMMWrapper* cublas_wrapper,
                IAllocator*      allocator,
                bool             is_free_buffer_after_forward,
                bool             sparse = false);

    FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~FfnFP8Layer();

    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);

    virtual ActivationType getActivationType() = 0;
};
template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    GeluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode_,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~GeluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Gelu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};
template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    ReluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~ReluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Relu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};
} // namespace turbomind
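
For context only (not part of the diff): a minimal usage sketch of the GELU FP8 FFN layer declared above. The element types (__nv_fp8_e4m3 / __nv_bfloat16), the inter_size and fp8_mode values, and the run_ffn_fp8 wrapper are illustrative assumptions, not taken from this commit; the surrounding runtime objects are assumed to be created elsewhere in turbomind.

// Hypothetical sketch: construct the layer and run one forward pass. The stream,
// cuBLAS wrapper, allocator, tensor maps, and weights are assumed to exist already.
#include "src/turbomind/layers/FfnFP8Layer.h"

using namespace turbomind;

void run_ffn_fp8(cudaStream_t                                      stream,
                 cublasMMWrapper*                                  cublas_wrapper,
                 IAllocator*                                       allocator,
                 TensorMap*                                        inputs,
                 TensorMap*                                        outputs,
                 const FfnFP8Weight<__nv_fp8_e4m3, __nv_bfloat16>* weights)
{
    // inter_size and fp8_mode are placeholder values for illustration only.
    GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16> ffn(/*inter_size=*/11008,
                                                      /*fp8_mode=*/1,
                                                      stream,
                                                      cublas_wrapper,
                                                      allocator,
                                                      /*is_free_buffer_after_forward=*/false);
    // forward() consumes the activations in `inputs` and writes the
    // bias + GELU result into `outputs`, as declared in FfnFP8Layer above.
    ffn.forward(outputs, inputs, weights);
}
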
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
    ScaleList* scale_list_ptr;
    float*     identity_scale;
    float*     identity_h_scale;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
    ScaleList* scale_list_ptr;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T>
class GeluFfnLayerINT8;

template<typename T>
class ReluFfnLayerINT8;

template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;

    // meta data
    size_t head_num_;
    size_t size_per_head_;

    // calculated data
    size_t hidden_units_;

    void allocateBuffer() override;
    void freeBuffer() override;
    bool isValidTokenNum(size_t token_num);

protected:
    size_t inter_size_;
    int    int8_mode_;
    bool   sparse_;

    int*    inter_int_buf_;
    int8_t* inter_buf_;

    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;

public:
    FfnLayerINT8(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           inter_size,
                 int              int8_mode,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse = false);

    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);

    ~FfnLayerINT8();

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>*                   ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    GeluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward,
                     bool             sparse = false);

    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);

    ~GeluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::sparse_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    ReluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward);

    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);

    ~ReluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace turbomind
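
Similarly, for context only: a minimal hypothetical sketch of driving the INT8 GELU FFN layer declared above. The template type (half), the shape and mode values, the include path, and the run_ffn_int8 wrapper are assumptions for illustration, not taken from this commit.

// Hypothetical sketch only; the runtime objects, tensors, and weights are assumed
// to be created elsewhere in turbomind.
#include "src/turbomind/layers/FfnLayerINT8.h"  // include path assumed

#include <vector>

using namespace turbomind;

void run_ffn_int8(cudaStream_t               stream,
                  cublasMMWrapper*           cublas_wrapper,
                  IAllocator*                allocator,
                  std::vector<Tensor>*       outputs,
                  const std::vector<Tensor>* inputs,
                  const FfnWeight<half>*     weights)
{
    // Shape and mode values are placeholders for illustration only.
    GeluFfnLayerINT8<half> ffn(/*max_batch_size=*/8,
                               /*max_seq_len=*/256,
                               /*head_num=*/32,
                               /*size_per_head=*/128,
                               /*inter_size=*/11008,
                               /*int8_mode=*/1,
                               stream,
                               cublas_wrapper,
                               allocator,
                               /*is_free_buffer_after_forward=*/false);
    // forward() reads the activations from `inputs` and produces the
    // bias + GELU output in `outputs`, per the declaration above.
    ffn.forward(outputs, inputs, weights);
}
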
@@ -23,7 +23,6 @@ set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(Llama PUBLIC -lcudart
        cublasMMWrapper
        DynamicDecodeLayer
-       BaseBeamSearchLayer
        activation_kernels
        decoder_masked_multihead_attention
        bert_preprocess_kernels