Unverified Commit 4c9959f6, authored by Chen Xin and committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
name: linux-x64-gpu
on:
  push:
    paths:
      - '.github/workflows/linux-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
  pull_request:
    paths:
      - '.github/workflows/linux-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
concurrency:
  group: linux-x64-gpu-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read
jobs:
  cuda-118:
    runs-on: ubuntu-latest
    container: openmmlab/lmdeploy-builder:cuda11.8
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Build
        run: |
          source /opt/conda/bin/activate
          conda activate py38
          mkdir build && cd build
          bash ../generate.sh
          make -j$(nproc) && make install
@@ -10,7 +10,7 @@ on:
 jobs:
-  build:
+  linux-build:
     strategy:
       matrix:
         pyver: [py38, py39, py310, py311]
@@ -39,11 +39,49 @@ jobs:
           path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/*
           retention-days: 1
+  windows-build:
+    strategy:
+      matrix:
+        pyver: ['3.8', '3.9', '3.10', '3.11']
+    runs-on: windows-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.pyver }}
+      - name: Install python packages
+        run: |
+          pip install pybind11 wheel
+      - uses: Jimver/cuda-toolkit@v0.2.11
+        id: cuda-toolkit
+        with:
+          cuda: '11.8.0'
+          use-github-cache: false
+      - name: Build wheel
+        run: |
+          mkdir build
+          cd build
+          ..\builder\windows\generate.ps1
+          cmake --build . --config Release -- /m > build.log.txt
+          cmake --install . --config Release
+          cd ..
+          rm build -Force -Recurse
+          python setup.py bdist_wheel -d build/wheel
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          if-no-files-found: error
+          path: build/wheel/*
+          retention-days: 1
   publish:
     runs-on: ubuntu-latest
     environment: 'prod'
     needs:
-      - build
+      - linux-build
+      - windows-build
     steps:
       - name: Download artifacts
         uses: actions/download-artifact@v3
......
name: windows-x64-gpu
on:
  push:
    paths:
      - '.github/workflows/windows-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
  pull_request:
    paths:
      - '.github/workflows/windows-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
concurrency:
  group: windows-x64-gpu-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read
jobs:
  cuda-118:
    runs-on: windows-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Set up python
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: Install python packages
        run: |
          pip install pybind11 wheel
      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: '11.8.0'
          use-github-cache: false
      - name: Build wheel
        run: |
          ((Get-Content -path CMakeLists.txt -Raw) -replace '-Wall','/W0') | Set-Content CMakeLists.txt
          $env:BUILD_TEST="ON"
          mkdir build
          cd build
          ..\builder\windows\generate.ps1
          cmake --build . --config Release -- /m /v:q
          if (-Not $?) {
            echo "build failed"
            exit 1
          }
          cmake --install . --config Release
          cd ..
          rm build -Force -Recurse
          python setup.py bdist_wheel -d build/wheel
@@ -168,6 +168,7 @@ if(BUILD_PYT)
   set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
 endif()
+set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
 # set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
@@ -288,6 +289,15 @@ if(BUILD_TEST)
 add_subdirectory(tests/csrc)
 endif()
+
+# install python api
+if (BUILD_PY_FFI)
+  install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
+endif ()
+
+if (MSVC)
+  return()
+endif ()
 # # Mesaure the compile time
 option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF)
 if (MEASURE_BUILD_TIME)
@@ -387,13 +397,6 @@ install(
   ${INSTALL_CONFIGDIR}
 )
-# install python api
-if (BUILD_PY_FFI)
-  install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
-endif ()
-install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
 export(
   EXPORT
   transformer-shared-targets
......
@@ -13,6 +13,7 @@ ______________________________________________________________________
 ## News 🎉
+- \[2023/08\] TurboMind supports Windows (tp=1)
 - \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. Check [this](./docs/en/w4a16.md) guide for detailed info
 - \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
 - \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
......
@@ -13,6 +13,7 @@ ______________________________________________________________________
 ## 更新 🎉
+- \[2023/08\] TurboMind 支持 Windows (tp=1)
 - \[2023/08\] TurboMind 支持 4-bit 推理,速度是 FP16 的 2.4 倍,是目前最快的开源实现🚀。部署方式请看[这里](./docs/zh_cn/w4a16.md)
 - \[2023/08\] LMDeploy 开通了 [HuggingFace Hub](https://huggingface.co/lmdeploy) ,提供开箱即用的 4-bit 模型
 - \[2023/08\] LMDeploy 支持使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法进行 4-bit 量化
......
@@ -6,6 +6,8 @@ PLAT_NAME="$2"
 DOCKER_TAG="$3"
 OUTPUT_DIR="$4"
+GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
+GIT_BRANCH=${GIT_BRANCH:-main}
 DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
 export USERID=$(id -u)
 export GROUPID=$(id -g)
@@ -18,6 +20,8 @@ docker run --rm -it \
     --env PLAT_NAME="${PLAT_NAME}" \
     --env USERID="${USERID}" \
     --env GROUPID="${GROUPID}" \
+    --env GIT_BRANCH="${GIT_BRANCH}" \
+    --env GIT_REMOTE="${GIT_REMOTE}" \
     --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
     --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
     --entrypoint /entrypoint_build.sh \
......
@@ -7,14 +7,20 @@ export USERID=${USERID}
 export GROUPID=${GROUPID}
 export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
+export GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
+export GIT_BRANCH=${GIT_BRANCH:-main}
 source /opt/conda/bin/activate
 conda activate $PYTHON_VERSION
-git clone https://github.com/InternLM/lmdeploy
+git clone -b ${GIT_BRANCH} ${GIT_REMOTE}
 cd lmdeploy
 mkdir build && cd build
 bash ../generate.sh
 make -j$(nproc) && make install
+if [ $? != 0 ]; then
+    echo "build failed"
+    exit 1
+fi
 cd ..
 rm -rf build
 python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/
......
# Build lmdeploy on Windows
## Requirements
- [CMake 3.17+](https://github.com/Kitware/CMake/releases)
- [Visual Studio 2019+](https://visualstudio.microsoft.com/downloads/)
- [CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
## Build lmdeploy wheel
```powershell
mkdir build
cd build
..\builder\windows\generate.ps1
cmake --build . --config Release -- /m
cmake --install . --config Release
cd ..
rm build -Force -Recurse
python setup.py bdist_wheel -d build\wheel
```
cmake .. -A x64 -T v142,cuda="$env:CUDA_PATH" `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_INSTALL_PREFIX=install `
-DBUILD_PY_FFI=ON `
-DBUILD_MULTI_GPU=OFF `
-DCMAKE_CUDA_FLAGS="-lineinfo" `
-DUSE_NVTX=ON `
-DBUILD_TEST="$env:BUILD_TEST"
@@ -3,6 +3,6 @@
 add_executable(llama_triton_example llama_triton_example.cc)
 target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart
                       LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils
-                      nvtx_utils word_list)
+                      nvtx_utils word_list -lpthread)
 install(TARGETS llama_triton_example DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
@@ -15,12 +15,15 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc
 #include "3rdparty/INIReader.h"
+#include <chrono>
 #include <memory>
 #include <thread>
+#include "src/turbomind/macro.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
 #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
@@ -427,6 +430,7 @@ int main(int argc, char* argv[])
     const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2];
+
     std::vector<int> seq_lens(batch_size);
     // step 6: check results
     if (node_id == 0) {
@@ -473,8 +477,7 @@
     if (1) {
         // test time
-        struct timeval start, end;
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         const int ite = 1;
         for (int i = 0; i < ite; i++) {
@@ -497,14 +500,15 @@
             ft::mpi::barrier();
         }
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         printf("[INFO] batch_size %d beam_width %d seq_len %d"
                " FT-CPP-GPT-Triton-time %.2f ms\n",
                batch_size,
                beam_width,
                seq_lens[0],
-               ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
+               dur.count() / ite);
     }
     if (kUSE_MPI) {
@@ -551,7 +555,6 @@ int read_start_ids(size_t batch_size,
         return 0;
     }
-    // Add padding
     for (int i = 0; i < (int)tmp_start_ids.size(); i++) {
         for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) {
......
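The timing change above (in `llama_triton_example.cc`) replaces `gettimeofday`, which comes from POSIX `<sys/time.h>` and is not available with MSVC, with `std::chrono`, which works on every supported platform. A stripped-down sketch of the resulting pattern, with the actual model invocation elided:

```cpp
#include <chrono>
#include <cstdio>

int main()
{
    const int ite = 1;

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < ite; i++) {
        // forward pass elided
    }
    auto end = std::chrono::high_resolution_clock::now();

    // duration<float, std::milli> converts the elapsed ticks to milliseconds.
    auto dur = std::chrono::duration<float, std::milli>(end - start);
    std::printf("elapsed %.2f ms per iteration\n", dur.count() / ite);
    return 0;
}
```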
@@ -15,6 +15,7 @@
  */
 #include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
......
@@ -101,7 +101,7 @@ void invokeBanBadWords(T* logits,
                        cudaStream_t stream)
 {
     dim3 block, grid;
-    block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);
+    block.x = min((unsigned long)((bad_words_len + 32 - 1) / 32) * 32, 256UL);
     grid.x = (bad_words_len + block.x - 1) / block.x;
     grid.y = local_batch_size * beam_width;
......
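The added cast above is an LLP64 portability fix: assuming `bad_words_len` is a `size_t`, the original `min(..., 256UL)` compiles on Linux, where `size_t` and `unsigned long` are the same 64-bit type, but fails with MSVC, where `size_t` is 64-bit while `unsigned long` is only 32-bit, so `min` no longer sees two operands of a single type. A minimal sketch of the failure and the fix (illustrative function, not the kernel itself):

```cpp
#include <algorithm>
#include <cstddef>

// Round the block dimension up to a multiple of 32, capped at 256 threads.
unsigned int roundBlockDim(std::size_t bad_words_len)
{
    // Fails to compile with MSVC: std::min cannot deduce one type from
    // (unsigned long long, unsigned long):
    //     std::min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);

    // Portable spelling used by the patch: cast so both operands are unsigned long.
    return (unsigned int)std::min((unsigned long)((bad_words_len + 32 - 1) / 32) * 32, 256UL);
}
```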
@@ -63,7 +63,11 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
     h_pinned_token_num[0] = 0;
     getPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>(
         h_pinned_token_num, tmp_mask_offset, cu_seqlens, sequence_lengths, batch_size, max_seq_len);
+#ifdef _MSC_VER
+    cudaStreamSynchronize(stream);
+#else
     while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
+#endif
     h_token_num[0] = h_pinned_token_num[0];
     sync_check_cuda_error();
 }
......
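The `#ifdef _MSC_VER` branch above matches the "use cudaStreamSynchronize instead of volatile check" item in the commit message: on Windows the host no longer spins on the pinned allocation but simply waits for the stream. A self-contained sketch of the two strategies (toy kernel and names, not the repository code):

```cpp
#include <cuda_runtime.h>

// Toy stand-in for getPaddingOffsetAndCuSeqLensKernel: writes a nonzero
// result into page-locked (pinned) host memory.
__global__ void writeTokenNum(size_t* h_pinned_token_num)
{
    *h_pinned_token_num = 1;
}

void waitForTokenNum(size_t* h_pinned_token_num, cudaStream_t stream)
{
    h_pinned_token_num[0] = 0;
    writeTokenNum<<<1, 1, 0, stream>>>(h_pinned_token_num);
#ifdef _MSC_VER
    // Block the host until all work queued on the stream has completed.
    cudaStreamSynchronize(stream);
#else
    // Busy-wait until the kernel's write to pinned memory becomes visible,
    // without synchronizing the whole stream.
    while (((volatile size_t*)h_pinned_token_num)[0] == 0) {}
#endif
    // Either way, h_pinned_token_num[0] is now safe to read on the host.
}
```

The commit does not spell out why the busy-wait misbehaves under MSVC; the synchronize call is simply the more conservative choice there.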
@@ -17,6 +17,7 @@
 #include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
 #include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/macro.h"
 // #include "src/turbomind/utils/cuda_bf16_wrapper.h"
 // #include "src/turbomind/utils/cuda_fp8_utils.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
@@ -1469,7 +1470,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         int offset = bhi * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B
                      + tlength_circ * QK_ELTS_IN_16B + ci;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<Qk_vec_m*>(&params.k_cache[offset]) = vec_conversion<Qk_vec_m, Qk_vec_k>(k);
         }
         else if (QUANT_POLICY == 4) {
@@ -1491,7 +1492,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
                      + tlength_circ * Dh + co * QK_ELTS_IN_16B + ci;
         }
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<Qk_vec_m*>(&params.k_cache_per_sample[bi][offset]) =
                 vec_conversion<Qk_vec_m, Qk_vec_k>(k);
         }
@@ -1582,7 +1583,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     T* k_cache_batch = nullptr;
     int8_t* k_cache_batch_int8 = nullptr;
-    if (not QUANT_POLICY) {
+    if (!QUANT_POLICY) {
         k_cache_batch = params.k_cache_per_sample ? (params.k_cache_per_sample[bi] + params.kv_cache_per_sample_offset
                                                      + kvhi * params.memory_max_len * Dh + ki) :
                                                     &params.k_cache[bhi * params.memory_max_len * Dh + ki];
@@ -1635,7 +1636,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
                 beam_offset = beam_indices[ti_circ] * params.num_heads * params.memory_max_len * Dh;
             }
-            if (not QUANT_POLICY) {
+            if (!QUANT_POLICY) {
                 k[ii] = vec_conversion<K_vec_k, K_vec_m>(
                     (*reinterpret_cast<const K_vec_m*>(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
             }
@@ -1769,7 +1770,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     int8_t* v_cache_int8 = nullptr;
     int8_t* v_cache_batch_int8 = nullptr;
-    if (not QUANT_POLICY) {
+    if (!QUANT_POLICY) {
         v_cache = params.v_cache_per_sample ? (params.v_cache_per_sample[bi] + params.kv_cache_per_sample_offset
                                                + kvhi * params.memory_max_len * Dh + vi) :
@@ -1824,7 +1825,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
             // Load the values from the cache.
             V_vec_k v;
-            if (not QUANT_POLICY) {
+            if (!QUANT_POLICY) {
                 v = vec_conversion<V_vec_k, V_vec_m>(
                     *reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti * Dh]));
             }
@@ -1871,7 +1872,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         const int beam_offset = HAS_BEAMS ? beam_src * params.num_heads * params.memory_max_len * Dh : 0;
         // Load the values from the cache.
         V_vec_k v;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             v = vec_conversion<V_vec_k, V_vec_m>(
                 *reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti_circ * Dh]));
         }
@@ -1931,7 +1932,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         // Store the values with bias back to global memory in the cache for V.
         //*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<V_vec_m*>(&v_cache[tlength_circ * Dh]) = vec_conversion<V_vec_m, V_vec_k>(v);
         }
         else if (QUANT_POLICY == 4) {
......
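Every `not QUANT_POLICY` → `!QUANT_POLICY` edit above addresses the same portability issue: MSVC only accepts the alternative operator tokens (`not`, `and`, `or`) in conformance mode (`/permissive-`) or via `<iso646.h>`, so the kernels switch to the plain punctuators that every compiler accepts. A small illustration (not repository code):

```cpp
#include <cstdio>

int main()
{
    int quant_policy = 0;

    // Rejected by MSVC under its default, non-conformant settings:
    //     if (not quant_policy) { ... }

    // Accepted everywhere:
    if (!quant_policy) {
        std::printf("no kv-cache quantization\n");
    }
    return 0;
}
```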
@@ -2,6 +2,7 @@
 #pragma once
+#include "src/turbomind/macro.h"
 #include <cassert>
 #include <cstdint>
 #include <cuda_fp16.h>
......
@@ -3,6 +3,7 @@
 #pragma once
 #include "metric.h"
+#include "src/turbomind/macro.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
......
@@ -28,6 +28,7 @@
 #include "src/turbomind/kernels/logprob_kernels.h"
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/logger.h"
 namespace turbomind {
......
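The diff adds `#include "src/turbomind/macro.h"` to many translation units but never shows the header itself. Given the `__PRETTY_FUNCTION__` bullet in the commit message, it is presumably a small portability shim; a purely hypothetical sketch of what such a header might contain (the real contents are not part of this diff):

```cpp
// macro.h -- hypothetical portability shim, for illustration only.
#pragma once

#ifdef _MSC_VER
// MSVC does not provide __PRETTY_FUNCTION__; __FUNCSIG__ carries the
// equivalent decorated signature.
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
```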
@@ -63,7 +63,7 @@ __device__ inline void copy(const void* local, void* data)
     *out = *in;
 }
-static const float HALF_FLT_MAX = 65504.F;
+#define HALF_FLT_MAX 65504.F
 #define FINAL_MASK 0xffffffff
 template<typename T>
......
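The `HALF_FLT_MAX` change swaps a namespace-scope `static const float` for a preprocessor macro. This matters for device code built with cl.exe as the host compiler: nvcc's exception that lets device code read const host variables covers floating-point types only for non-MSVC hosts, while a macro is expanded textually before any such check applies. A sketch of the pattern (illustrative kernel, assuming the constant is used inside `__global__`/`__device__` functions as in the reduction utilities):

```cpp
#include <cuda_runtime.h>

// static const float HALF_FLT_MAX = 65504.F;  // not usable in device code on MSVC hosts
#define HALF_FLT_MAX 65504.F                    // substituted by the preprocessor, works everywhere

__global__ void clampToHalfMax(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && x[i] > HALF_FLT_MAX) {
        x[i] = HALF_FLT_MAX;
    }
}
```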