Unverified commit 981a4610, authored by Li Zhang, committed by GitHub

[Fix] Remove unused code to reduce binary size (#181)

* clean-up

* fix lint

* fix lint
parent 83697422
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
    void allocateBuffer() override;
    void freeBuffer() override;
    void allocateBuffer(size_t token_num);

protected:
    const int fp8_mode_;
    T1*       inter_buf_      = nullptr;
    T2*       inter_buf_bf16_ = nullptr;
    size_t    inter_size_;

    virtual void invokeAddBiasActivation(const int    m,
                                         const T2*    bias,
                                         const float* input_scale,
                                         const float* input_scale_2,
                                         const float* input_scale_2_min,
                                         const float* output_scale) = 0;

public:
    FfnFP8Layer(size_t           inter_size,
                int              fp8_mode,
                cudaStream_t     stream,
                cublasMMWrapper* cublas_wrapper,
                IAllocator*      allocator,
                bool             is_free_buffer_after_forward,
                bool             sparse = false);

    FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~FfnFP8Layer();

    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);

    virtual ActivationType getActivationType() = 0;
};
template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    GeluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode_,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~GeluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Gelu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};
template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    ReluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~ReluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Relu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};
} // namespace turbomind
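
For context only (not part of the diff): a minimal usage sketch of the GELU FP8 FFN layer declared above. The element types (__nv_fp8_e4m3 / __nv_bfloat16), the inter_size and fp8_mode values, and the run_ffn_fp8 wrapper are illustrative assumptions, not taken from this commit; the surrounding runtime objects are assumed to be created elsewhere in turbomind.

// Hypothetical sketch: construct the layer and run one forward pass. The stream,
// cuBLAS wrapper, allocator, tensor maps, and weights are assumed to exist already.
#include "src/turbomind/layers/FfnFP8Layer.h"

using namespace turbomind;

void run_ffn_fp8(cudaStream_t                                      stream,
                 cublasMMWrapper*                                  cublas_wrapper,
                 IAllocator*                                       allocator,
                 TensorMap*                                        inputs,
                 TensorMap*                                        outputs,
                 const FfnFP8Weight<__nv_fp8_e4m3, __nv_bfloat16>* weights)
{
    // inter_size and fp8_mode are placeholder values for illustration only.
    GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16> ffn(/*inter_size=*/11008,
                                                      /*fp8_mode=*/1,
                                                      stream,
                                                      cublas_wrapper,
                                                      allocator,
                                                      /*is_free_buffer_after_forward=*/false);
    // forward() consumes the activations in `inputs` and writes the
    // bias + GELU result into `outputs`, as declared in FfnFP8Layer above.
    ffn.forward(outputs, inputs, weights);
}
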
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
    ScaleList* scale_list_ptr;
    float*     identity_scale;
    float*     identity_h_scale;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
    ScaleList* scale_list_ptr;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T>
class GeluFfnLayerINT8;

template<typename T>
class ReluFfnLayerINT8;

template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;

    // meta data
    size_t head_num_;
    size_t size_per_head_;

    // calculated data
    size_t hidden_units_;

    void allocateBuffer() override;
    void freeBuffer() override;
    bool isValidTokenNum(size_t token_num);

protected:
    size_t inter_size_;
    int    int8_mode_;
    bool   sparse_;

    int*    inter_int_buf_;
    int8_t* inter_buf_;

    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;

public:
    FfnLayerINT8(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           inter_size,
                 int              int8_mode,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse = false);

    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);

    ~FfnLayerINT8();

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>*                   ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    GeluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward,
                     bool             sparse = false);

    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);

    ~GeluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::sparse_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    ReluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward);

    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);

    ~ReluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace turbomind
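
Similarly, for context only: a minimal hypothetical sketch of driving the INT8 GELU FFN layer declared above. The template type (half), the shape and mode values, the include path, and the run_ffn_int8 wrapper are assumptions for illustration, not taken from this commit.

// Hypothetical sketch only; the runtime objects, tensors, and weights are assumed
// to be created elsewhere in turbomind.
#include "src/turbomind/layers/FfnLayerINT8.h"  // include path assumed

#include <vector>

using namespace turbomind;

void run_ffn_int8(cudaStream_t               stream,
                  cublasMMWrapper*           cublas_wrapper,
                  IAllocator*                allocator,
                  std::vector<Tensor>*       outputs,
                  const std::vector<Tensor>* inputs,
                  const FfnWeight<half>*     weights)
{
    // Shape and mode values are placeholders for illustration only.
    GeluFfnLayerINT8<half> ffn(/*max_batch_size=*/8,
                               /*max_seq_len=*/256,
                               /*head_num=*/32,
                               /*size_per_head=*/128,
                               /*inter_size=*/11008,
                               /*int8_mode=*/1,
                               stream,
                               cublas_wrapper,
                               allocator,
                               /*is_free_buffer_after_forward=*/false);
    // forward() reads the activations from `inputs` and produces the
    // bias + GELU output in `outputs`, per the declaration above.
    ffn.forward(outputs, inputs, weights);
}
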
@@ -23,7 +23,6 @@ set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(Llama PUBLIC -lcudart
        cublasMMWrapper
        DynamicDecodeLayer
-       BaseBeamSearchLayer
        activation_kernels
        decoder_masked_multihead_attention
        bert_preprocess_kernels