"...git@developer.sourcefind.cn:OpenDAS/llama-factory.git" did not exist on "37b0ad9ffc8cf167b47da1a6f8898bfa5dec999a"
Unverified commit fe46dac2, authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
@@ -40,7 +40,7 @@ struct LlamaWeight {
    ~LlamaWeight();

    LlamaWeight(const LlamaWeight& other) = delete;
    LlamaWeight& operator=(const LlamaWeight& other) = delete;

    void loadModel(std::string dir_path);
@@ -67,4 +67,4 @@ private:
    size_t tensor_para_rank_;
};

} // namespace fastertransformer
@@ -25,8 +25,7 @@ struct Request {
    using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
    Callback stream_cb;

    enum {
        kInvalid = 1,
        kConflict = 2,
        kBusy = 3,
@@ -88,4 +87,4 @@ private:
    std::condition_variable cv_;
};

} // namespace fastertransformer
@@ -10,7 +10,7 @@
#include <cutlass/half.h>
#include <cutlass/platform/platform.h>

// modified from:
// https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h

namespace fastertransformer {
...
@@ -11,7 +11,8 @@ namespace cg = cooperative_groups;
namespace fastertransformer {

template<typename T>
struct res_norm_ops_t {
};

template<typename T>
struct res_norm_t {
@@ -144,7 +145,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
template<typename T>
void invokeFusedAddBiasResidualRMSNorm(
    T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
{
    constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
    FT_CHECK(n_dims % PACK_DIM == 0);
@@ -154,7 +155,7 @@ void invokeFusedAddBiasResidualRMSNorm(
    n_threads = (n_threads + 31) / 32 * 32;  // round up to the nearest multiple of warp size
    fusedAddBiasResidualNorm<<<batch_size, n_threads, 0, stream>>>(
        residual, in_out, bias, scale, eps, batch_size, n_dims);
}

template void
...
@@ -6,6 +6,6 @@ namespace fastertransformer {
template<typename T>
void invokeFusedAddBiasResidualRMSNorm(
    T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);

} // namespace fastertransformer
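For context, a fused add-bias-residual RMSNorm with this signature is commonly understood to compute, per row of n_dims elements: residual += in_out + bias, then in_out = RMSNorm(residual) * scale. The host-side reference below is a minimal sketch assuming exactly those semantics; the function name is illustrative and it is not part of this commit or the CUDA kernel.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative reference only (not the CUDA kernel): assumes the fused semantics
//   residual += in_out + bias;  in_out = RMSNorm(residual) * scale
// applied independently to one row of n_dims elements.
void fused_add_bias_residual_rmsnorm_ref(std::vector<float>& residual,
                                         std::vector<float>& in_out,
                                         const std::vector<float>& bias,
                                         const std::vector<float>& scale,
                                         float eps)
{
    const std::size_t n = in_out.size();
    for (std::size_t i = 0; i < n; ++i) {
        residual[i] += in_out[i] + bias[i];  // accumulate the residual branch
    }
    float sum_sq = 0.f;
    for (std::size_t i = 0; i < n; ++i) {
        sum_sq += residual[i] * residual[i];  // RMS statistic over the row
    }
    const float inv_rms = 1.f / std::sqrt(sum_sq / n + eps);
    for (std::size_t i = 0; i < n; ++i) {
        in_out[i] = residual[i] * inv_rms * scale[i];  // normalize and apply the learned scale
    }
}

int main()
{
    std::vector<float> residual{0.f, 0.f}, in_out{1.f, 2.f}, bias{0.1f, 0.1f}, scale{1.f, 1.f};
    fused_add_bias_residual_rmsnorm_ref(residual, in_out, bias, scale, 1e-6f);
    std::printf("%f %f\n", in_out[0], in_out[1]);
    return 0;
}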
// Copyright (c) OpenMMLab. All rights reserved.

#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"

namespace fastertransformer {
@@ -293,37 +293,40 @@ inline __device__ float2 float2div(float a, float2 b)
    return c;
}

static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale)
{
    half4 dst;
    dst.x = __float2half(value.x * scale);
    dst.y = __float2half(value.y * scale);
    dst.z = __float2half(value.z * scale);
    dst.w = __float2half(value.w * scale);
    return dst;
}

static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w)
{
    uint32_t dst;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
    uint32_t a;
    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
    uint32_t b;
    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
    uint32_t c;
    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
    uint32_t d;
    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));

    asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
    asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
#else
    char4 tmp;
    tmp.x = x;
    tmp.y = y;
    tmp.z = z;
    tmp.w = w;
    dst = reinterpret_cast<const uint32_t&>(tmp);
#endif
    return dst;
}
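For context on the int8 KV-cache path: char4_scale_to_half4 above dequantizes by multiplying each stored int8 value by a per-tensor scale, and float4_to_char4 packs four floats into saturated, round-to-nearest int8. Below is a minimal host-side sketch of the corresponding symmetric round trip, assuming the stored value is round(value / scale); the helper names are illustrative, not the repository's.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Round to nearest and saturate to int8, mirroring the cvt.rni.sat / cvt.pack.sat.s8 path.
static int8_t quantize_sat(float x)
{
    const float r = std::nearbyint(x);
    return static_cast<int8_t>(std::min(127.f, std::max(-128.f, r)));
}

// Dequantize one element the way char4_scale_to_half4 does (int8 value times the kv scale).
static float dequantize(int8_t q, float scale)
{
    return static_cast<float>(q) * scale;
}

int main()
{
    const float  kv_scale = 0.05f;  // stand-in for kv_scale[0] / kv_scale[1] in the launchers
    const float  value    = 1.37f;
    const int8_t q        = quantize_sat(value / kv_scale);  // assumed symmetric scheme
    std::printf("%.4f -> %d -> %.4f\n", value, static_cast<int>(q), dequantize(q, kv_scale));
    return 0;
}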
template<typename T>
@@ -380,7 +383,6 @@ __global__ void extend_value_cache_int8(int8_t** v_dst,
    }
}

template<typename T>
void invokeExtendKVCache(T** k_dst,
                         T** v_dst,
@@ -404,18 +406,48 @@ void invokeExtendKVCache(T** k_dst,
    dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);

    if (quant & QuantPolicy::kCacheKVInt8) {
        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(k_dst),
                                                               dst_offset,
                                                               k_src,
                                                               local_head_num,
                                                               size_per_head,
                                                               query_length,
                                                               history_length,
                                                               max_q_len,
                                                               max_seq_len,
                                                               kv_scale[0]);

        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(v_dst),
                                                               dst_offset,
                                                               v_src,
                                                               local_head_num,
                                                               size_per_head,
                                                               query_length,
                                                               history_length,
                                                               max_q_len,
                                                               max_seq_len,
                                                               kv_scale[1]);
    }
    else {
        extend_value_cache<<<grid, block_sz, 0, stream>>>(k_dst,
                                                          dst_offset,
                                                          k_src,
                                                          local_head_num,
                                                          size_per_head,
                                                          query_length,
                                                          history_length,
                                                          max_q_len,
                                                          max_seq_len);
        extend_value_cache<<<grid, block_sz, 0, stream>>>(v_dst,
                                                          dst_offset,
                                                          v_src,
                                                          local_head_num,
                                                          size_per_head,
                                                          query_length,
                                                          history_length,
                                                          max_q_len,
                                                          max_seq_len);
    }
}
@@ -492,17 +524,16 @@ __global__ void transpose_value_cache(T* v_dst, //
    }
}

template<typename T>
__global__ void transpose_value_cache_int8(T* v_dst,  //
                                           const int8_t** v_src,
                                           const size_t src_offset,
                                           const int head_num,
                                           const int size_per_head,
                                           const int* seq_length,
                                           const int max_kv_len,
                                           const int max_seq_len,
                                           const float v_scale)
{
    const int batch_id = blockIdx.y;
    const int head_id = blockIdx.z;
@@ -533,7 +564,7 @@ __global__ void transpose_value_cache_int8(T* v_dst, //
        // int8x8 -> fp16x8
        const auto from_ptr = reinterpret_cast<const char4*>(val_src + src_idx);
        auto to_ptr = reinterpret_cast<half4*>(val_dst + dst_idx);

        to_ptr[0] = char4_scale_to_half4(from_ptr[0], v_scale);
        to_ptr[1] = char4_scale_to_half4(from_ptr[1], v_scale);
@@ -562,13 +593,27 @@ void invokeTransposeKVCache(T* key_cache_trans,
    dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);

    if (quant & QuantPolicy::kCacheKVInt8) {
        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(key_cache_trans,
                                                                  reinterpret_cast<const int8_t**>(key_cache),
                                                                  src_offset,
                                                                  head_num,
                                                                  size_per_head,
                                                                  key_length,
                                                                  max_kv_len,
                                                                  max_seq_len,
                                                                  kv_scale[0]);

        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(val_cache_trans,
                                                                  reinterpret_cast<const int8_t**>(val_cache),
                                                                  src_offset,
                                                                  head_num,
                                                                  size_per_head,
                                                                  key_length,
                                                                  max_kv_len,
                                                                  max_seq_len,
                                                                  kv_scale[1]);
    }
    else {
        transpose_value_cache<<<grid, block_sz, 0, stream>>>(
            key_cache_trans, key_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
@@ -577,10 +622,34 @@ void invokeTransposeKVCache(T* key_cache_trans,
    }
}
template void invokeTransposeKVCache(float*,
                                     float*,
                                     const float**,
                                     const float**,
                                     size_t,
                                     int,
                                     const int*,
                                     int,
                                     int,
                                     int,
                                     int,
                                     cudaStream_t stream,
                                     int,
                                     const float*);
template void invokeTransposeKVCache(half*,
                                     half*,
                                     const half**,
                                     const half**,
                                     size_t,
                                     int,
                                     const int*,
                                     int,
                                     int,
                                     int,
                                     int,
                                     cudaStream_t stream,
                                     int,
                                     const float*);
__global__ void gatherOutput(int* output_ids,
                             const int* ids,
@@ -619,4 +688,4 @@ void invokeGatherOutput(int* output_ids,
        output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}

} // namespace fastertransformer
@@ -163,4 +163,4 @@ inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_
    FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
}

} // namespace fastertransformer
@@ -157,4 +157,4 @@ bool isDebug()
    return is_debug;
}

} // namespace fastertransformer
@@ -18,8 +18,7 @@ enum QuantPolicy {
    kCacheKVInt8 = 0x04,
};

enum CmpMode {
    kCmpNone,
    kCmpRead,
    kCmpWrite,
@@ -65,4 +64,4 @@ size_t curandStateGetSize();
bool isDebug();

} // namespace fastertransformer
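For context, QuantPolicy members such as kCacheKVInt8 = 0x04 are bit flags, which is why the KV-cache launchers above test the policy with a bitwise AND. A minimal sketch follows; the enum is abbreviated and kNone is added here purely for illustration.

#include <cstdio>

// Abbreviated from llama_utils.h; only kCacheKVInt8 appears in the diff.
enum QuantPolicy {
    kNone        = 0x00,  // illustrative placeholder, not taken from the diff
    kCacheKVInt8 = 0x04,
};

int main()
{
    const int quant = kCacheKVInt8;  // policy word handed down to the KV-cache launchers
    if (quant & QuantPolicy::kCacheKVInt8) {
        std::printf("int8 KV cache path selected\n");
    }
    return 0;
}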
@@ -6,4 +6,4 @@ template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);

template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st);
@@ -25,7 +25,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Modified from
// https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc

#include <stdint.h>
@@ -1399,7 +1400,7 @@ void ModelInstanceState::SetInputTensors(
    auto batch_input_name = batch_input.TargetNames()[0];
    // we only take care of the ragged input_ids
    // Assume the first dimension (length) are different and others are the
    // same BATCH_ITEM_SHAPE [num_requests (batches), num_dims (excluding
    // batch dimension)]
    if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE
@@ -1464,7 +1465,7 @@ void ModelInstanceState::SetInputTensors(
        param.batch_input_ptr + param.batch_intput_size,
        [&](int x) { return x != param.batch_input_ptr[0]; });
    // calculate statistics of elements
    if (param.is_input_ragged) {
        param.max_elements_per_seq =
            *std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
...
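For context on the hunk above: the lambda compares every per-request element count against the first one, i.e. it detects a ragged batch, and max_elements_per_seq then records the longest sequence. The snippet below is a self-contained sketch of the same idiom; the container and names are illustrative, not the backend's.

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Per-request element counts, e.g. input_ids lengths within one batch.
    const std::vector<int> counts = {7, 7, 5, 7};

    // Same idiom as the hunk: compare every element against the first one.
    const bool is_ragged =
        std::any_of(counts.begin(), counts.end(), [&](int x) { return x != counts[0]; });

    // When the batch is ragged, pad to the longest sequence.
    const int max_elements_per_seq = *std::max_element(counts.begin(), counts.end());

    std::printf("ragged: %s, max elements per sequence: %d\n", is_ragged ? "yes" : "no", max_elements_per_seq);
    return 0;
}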
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

{
    global:
        TRITONBACKEND_*;
    local: *;
};
@@ -309,7 +309,8 @@ std::string LlamaTritonModel<T>::toString()
       << "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
       << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
       << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
       << "\nprefix_cache_len: " << prefix_cache_len_ << "\nmodel_dir: " << model_dir_
       << "\nquant_policy: " << quant_policy_ << std::endl;
    return ss.str();
}
...
@@ -15,7 +15,8 @@
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h

#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
...
@@ -15,7 +15,8 @@
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h

#pragma once
...
@@ -15,7 +15,8 @@
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp

#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/nccl_utils.h"
...
@@ -15,7 +15,8 @@
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp

#pragma once
...
@@ -76,7 +76,7 @@ if(ENABLE_FP8)
    add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
    set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
    set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
    target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
                          cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
endif()
...
@@ -35,8 +35,7 @@
namespace fastertransformer {

typedef enum datatype_enum {
    TYPE_INVALID,
    TYPE_BOOL,
    TYPE_UINT8,
@@ -99,8 +98,7 @@ DataType getTensorType()
    }
}

typedef enum memorytype_enum {
    MEMORY_CPU,
    MEMORY_CPU_PINNED,
    MEMORY_GPU
...
@@ -63,4 +63,4 @@ inline bool isGatedActivation(ActivationType activaiton_type)
        || activaiton_type == ActivationType::SiGLU;
}

} // namespace fastertransformer