Unverified Commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
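Most of the diff below is mechanical clang-format output. As a rough illustration only (the commit's actual .clang-format settings are not visible in this view), the brace-placement change applied to enums and structs throughout looks like this:

    // Before: opening brace on its own line
    enum CmpMode
    {
        kCmpNone,
        kCmpRead,
        kCmpWrite,
    };

    // After clang-format, as applied in the hunks below: attached brace
    enum CmpMode {
        kCmpNone,
        kCmpRead,
        kCmpWrite,
    };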
......@@ -40,7 +40,7 @@ struct LlamaWeight {
~LlamaWeight();
LlamaWeight(const LlamaWeight& other) = delete;
LlamaWeight(const LlamaWeight& other) = delete;
LlamaWeight& operator=(const LlamaWeight& other) = delete;
void loadModel(std::string dir_path);
......@@ -67,4 +67,4 @@ private:
size_t tensor_para_rank_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -25,8 +25,7 @@ struct Request {
using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
Callback stream_cb;
enum
{
enum {
kInvalid = 1,
kConflict = 2,
kBusy = 3,
......@@ -88,4 +87,4 @@ private:
std::condition_variable cv_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -10,7 +10,7 @@
#include <cutlass/half.h>
#include <cutlass/platform/platform.h>
// modifiy from:
// modified from:
// https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h
namespace fastertransformer {
......
......@@ -11,7 +11,8 @@ namespace cg = cooperative_groups;
namespace fastertransformer {
template<typename T>
struct res_norm_ops_t {};
struct res_norm_ops_t {
};
template<typename T>
struct res_norm_t {
......@@ -144,7 +145,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
template<typename T>
void invokeFusedAddBiasResidualRMSNorm(
T* residual, T* inout, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
{
constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
FT_CHECK(n_dims % PACK_DIM == 0);
......@@ -154,7 +155,7 @@ void invokeFusedAddBiasResidualRMSNorm(
n_threads = (n_threads + 31) / 32 * 32; // round up to the nearest multiple of warp size
fusedAddBiasResidualNorm<<<batch_size, n_threads, 0, stream>>>(
residual, inout, bias, scale, eps, batch_size, n_dims);
residual, in_out, bias, scale, eps, batch_size, n_dims);
}
template void
......
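For the RMSNorm hunk above: the launch packs elements into uint4 loads (hence the FT_CHECK on n_dims) and rounds the thread count up to a warp multiple. A minimal host-side sketch of that arithmetic for half precision; the initial "one packed group per thread" assumption matches the visible check but the hidden lines of the function may differ:

    #include <cstdio>

    int main() {
        // For T = half (2 bytes), a uint4 (16 bytes) packs 8 elements per thread.
        const int pack_dim = 16 / 2;             // sizeof(uint4) / sizeof(half)
        const int n_dims   = 4096;               // hypothetical hidden size; must be divisible by pack_dim
        int n_threads = n_dims / pack_dim;       // assumed: one packed load per thread -> 512
        n_threads = (n_threads + 31) / 32 * 32;  // round up to the nearest multiple of warp size
        std::printf("pack_dim=%d, n_threads=%d\n", pack_dim, n_threads);
        return 0;
    }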
......@@ -6,6 +6,6 @@ namespace fastertransformer {
template<typename T>
void invokeFusedAddBiasResidualRMSNorm(
T* residual, T* inout, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
namespace fastertransformer {
......@@ -293,37 +293,40 @@ inline __device__ float2 float2div(float a, float2 b)
return c;
}
static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale) {
half4 dst;
dst.x = __float2half(value.x * scale);
dst.y = __float2half(value.y * scale);
dst.z = __float2half(value.z * scale);
dst.w = __float2half(value.w * scale);
return dst;
static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale)
{
half4 dst;
dst.x = __float2half(value.x * scale);
dst.y = __float2half(value.y * scale);
dst.z = __float2half(value.z * scale);
dst.w = __float2half(value.w * scale);
return dst;
}
static inline __device__ uint32_t float4_to_char4(float x,
float y,
float z,
float w) {
uint32_t dst;
static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w)
{
uint32_t dst;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
uint32_t a; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
uint32_t b; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
uint32_t c; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
uint32_t d; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
uint32_t a;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
uint32_t b;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
uint32_t c;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
uint32_t d;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
#else
char4 tmp;
tmp.x = x;
tmp.y = y;
tmp.z = z;
tmp.w = w;
dst = reinterpret_cast<const uint32_t&>(tmp);
char4 tmp;
tmp.x = x;
tmp.y = y;
tmp.z = z;
tmp.w = w;
dst = reinterpret_cast<const uint32_t&>(tmp);
#endif
return dst;
return dst;
}
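The two helpers above form the int8 KV-cache conversion pair: float4_to_char4 converts with round-to-nearest and s8 saturation, and char4_scale_to_half4 dequantizes each lane by multiplying with the per-tensor scale. A host-side reference of the same round trip, for intuition only; the division by scale before quantization happens in callers not shown here, so treating scale as the step size is an assumption:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Reference (host) quantize: round-to-nearest with int8 saturation,
    // mirroring the cvt.rni.sat path used on device.
    static int8_t quantize_sat(float x, float scale) {
        float q = std::nearbyint(x / scale);
        q = std::min(127.0f, std::max(-128.0f, q));
        return static_cast<int8_t>(q);
    }

    // Reference dequantize: what char4_scale_to_half4 does per lane.
    static float dequantize(int8_t v, float scale) {
        return static_cast<float>(v) * scale;
    }

    int main() {
        const float scale = 0.05f;  // hypothetical per-tensor kv_scale
        const float x = 1.234f;
        const int8_t q = quantize_sat(x, scale);
        std::printf("x=%.3f  q=%d  x'=%.3f\n", x, q, dequantize(q, scale));
        return 0;
    }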
template<typename T>
......@@ -380,7 +383,6 @@ __global__ void extend_value_cache_int8(int8_t** v_dst,
}
}
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
......@@ -404,18 +406,48 @@ void invokeExtendKVCache(T** k_dst,
dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(
reinterpret_cast<int8_t**>(k_dst), dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len, kv_scale[0]);
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(
reinterpret_cast<int8_t**>(v_dst), dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len, kv_scale[1]);
} else {
extend_value_cache<<<grid, block_sz, 0, stream>>>(
k_dst, dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(
v_dst, dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(k_dst),
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[0]);
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(v_dst),
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[1]);
}
else {
extend_value_cache<<<grid, block_sz, 0, stream>>>(k_dst,
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(v_dst,
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
}
}
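Both branches above launch with the same grid: the x dimension covers max_q_len * size_per_head / x elements in blocks of block_sz via ceil division, y is the local batch, z the local head count. A tiny standalone sketch of that sizing arithmetic, with made-up example values (block_sz and the model sizes are not shown in this hunk):

    #include <cstdio>

    // Ceil division: number of blocks of size `block` needed to cover `n` items.
    static int ceil_div(int n, int block) { return (n + block - 1) / block; }

    int main() {
        // Hypothetical values, not taken from any real model config.
        const int max_q_len = 128, size_per_head = 128, x = 8, block_sz = 128;
        const int local_batch_size = 4, local_head_num = 32;
        const int grid_x = ceil_div(max_q_len * size_per_head / x, block_sz);
        std::printf("grid = (%d, %d, %d), block = %d\n",
                    grid_x, local_batch_size, local_head_num, block_sz);
        return 0;
    }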
......@@ -492,17 +524,16 @@ __global__ void transpose_value_cache(T* v_dst, //
}
}
template<typename T>
__global__ void transpose_value_cache_int8(T* v_dst, //
const int8_t** v_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len,
const float v_scale)
__global__ void transpose_value_cache_int8(T* v_dst, //
const int8_t** v_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len,
const float v_scale)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
......@@ -533,7 +564,7 @@ __global__ void transpose_value_cache_int8(T* v_dst, //
// int8x8 -> fp16x8
const auto from_ptr = reinterpret_cast<const char4*>(val_src + src_idx);
auto to_ptr = reinterpret_cast<half4*>(val_dst + dst_idx);
auto to_ptr = reinterpret_cast<half4*>(val_dst + dst_idx);
to_ptr[0] = char4_scale_to_half4(from_ptr[0], v_scale);
to_ptr[1] = char4_scale_to_half4(from_ptr[1], v_scale);
......@@ -562,13 +593,27 @@ void invokeTransposeKVCache(T* key_cache_trans,
dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(
key_cache_trans, reinterpret_cast<const int8_t**>(key_cache), src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len, kv_scale[0]);
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(
val_cache_trans, reinterpret_cast<const int8_t**>(val_cache), src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len, kv_scale[1]);
} else {
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(key_cache_trans,
reinterpret_cast<const int8_t**>(key_cache),
src_offset,
head_num,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[0]);
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(val_cache_trans,
reinterpret_cast<const int8_t**>(val_cache),
src_offset,
head_num,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[1]);
}
else {
transpose_value_cache<<<grid, block_sz, 0, stream>>>(
key_cache_trans, key_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
......@@ -577,10 +622,34 @@ void invokeTransposeKVCache(T* key_cache_trans,
}
}
template void invokeTransposeKVCache(
float*, float*, const float**, const float**, size_t, int, const int*, int, int, int, int, cudaStream_t stream, int, const float*);
template void invokeTransposeKVCache(
half*, half*, const half**, const half**, size_t, int, const int*, int, int, int, int, cudaStream_t stream, int, const float*);
template void invokeTransposeKVCache(float*,
float*,
const float**,
const float**,
size_t,
int,
const int*,
int,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
template void invokeTransposeKVCache(half*,
half*,
const half**,
const half**,
size_t,
int,
const int*,
int,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
__global__ void gatherOutput(int* output_ids,
const int* ids,
......@@ -619,4 +688,4 @@ void invokeGatherOutput(int* output_ids,
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -163,4 +163,4 @@ inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_
FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -157,4 +157,4 @@ bool isDebug()
return is_debug;
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -18,8 +18,7 @@ enum QuantPolicy {
kCacheKVInt8 = 0x04,
};
enum CmpMode
{
enum CmpMode {
kCmpNone,
kCmpRead,
kCmpWrite,
......@@ -65,4 +64,4 @@ size_t curandStateGetSize();
bool isDebug();
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
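QuantPolicy is used as a bit mask: the KV-cache kernels earlier in this commit test `quant & QuantPolicy::kCacheKVInt8` to pick between the int8 and the full-precision path. A minimal sketch of that dispatch pattern; the surrounding function is hypothetical and the enum's other members are elided in this view:

    #include <cstdio>

    // Mirrors the flag shown in the hunk above; other bits omitted here.
    enum QuantPolicy {
        kCacheKVInt8 = 0x04,
    };

    static void dispatch(int quant) {
        if (quant & QuantPolicy::kCacheKVInt8) {
            std::puts("int8 KV-cache kernels");      // e.g. extend_value_cache_int8
        }
        else {
            std::puts("full-precision KV-cache kernels");
        }
    }

    int main() {
        dispatch(0);
        dispatch(QuantPolicy::kCacheKVInt8);
        return 0;
    }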
......@@ -6,4 +6,4 @@ template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st);
\ No newline at end of file
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st);
......@@ -25,7 +25,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Modified from https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
// Modified from
// https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
#include <stdint.h>
......@@ -1399,7 +1400,7 @@ void ModelInstanceState::SetInputTensors(
auto batch_input_name = batch_input.TargetNames()[0];
// we only take care of the ragged input_ids
// Assume the first dimention (length) are different and others are the
// Assume the first dimension (length) are different and others are the
// same BATCH_ITEM_SHAPE [num_requests (batches), num_dims (excluding
// batch dimension)]
if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE
......@@ -1464,7 +1465,7 @@ void ModelInstanceState::SetInputTensors(
param.batch_input_ptr + param.batch_intput_size,
[&](int x) { return x != param.batch_input_ptr[0]; });
// calculate statics of elements
// calculate statistics of elements
if (param.is_input_ragged) {
param.max_elements_per_seq =
*std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
......
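For the ragged-input hunk above: the backend treats a batch input as ragged when not every per-request length matches the first one, then records the maximum per-sequence element count. A standalone sketch of those two statistics on a plain array; the names are illustrative, and whether the visible lambda is wrapped in std::any_of or a similar algorithm is not shown in this hunk:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // Per-request lengths of a hypothetical ragged batch.
        const int lengths[] = {7, 12, 9, 12};
        const int n = 4;

        // Ragged if any length differs from the first one (same comparison as above).
        const bool is_ragged =
            std::any_of(lengths, lengths + n, [&](int x) { return x != lengths[0]; });

        // Largest number of elements any single sequence contributes.
        const int max_elements_per_seq = *std::max_element(lengths, lengths + n);

        std::printf("ragged=%d, max_elements_per_seq=%d\n", (int)is_ragged, max_elements_per_seq);
        return 0;
    }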
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#Copyright(c) 2021 - 2022, NVIDIA CORPORATION.All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions
#are met:
#* Redistributions of source code must retain the above copyright
#notice, this list of conditions and the following disclaimer.
#* Redistributions in binary form must reproduce the above copyright
#notice, this list of conditions and the following disclaimer in the
#documentation and / or other materials provided with the distribution.
#* Neither the name of NVIDIA CORPORATION nor the names of its
#contributors may be used to endorse or promote products derived
#from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR
#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
global:
TRITONBACKEND_*;
local: *;
local:
*;
};
......@@ -309,7 +309,8 @@ std::string LlamaTritonModel<T>::toString()
<< "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
<< "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
<< "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
<< "\nprefix_cache_len: " << prefix_cache_len_ << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_ << std::endl;
<< "\nprefix_cache_len: " << prefix_cache_len_ << "\nmodel_dir: " << model_dir_
<< "\nquant_policy: " << quant_policy_ << std::endl;
return ss.str();
}
......
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
......
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
......
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/nccl_utils.h"
......
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
#pragma once
......
......@@ -76,7 +76,7 @@ if(ENABLE_FP8)
add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
endif()
......
......@@ -35,8 +35,7 @@
namespace fastertransformer {
typedef enum datatype_enum
{
typedef enum datatype_enum {
TYPE_INVALID,
TYPE_BOOL,
TYPE_UINT8,
......@@ -99,8 +98,7 @@ DataType getTensorType()
}
}
typedef enum memorytype_enum
{
typedef enum memorytype_enum {
MEMORY_CPU,
MEMORY_CPU_PINNED,
MEMORY_GPU
......
......@@ -63,4 +63,4 @@ inline bool isGatedActivation(ActivationType activaiton_type)
|| activaiton_type == ActivationType::SiGLU;
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer