Unverified commit 6ccc0bff authored by TJian, committed by GitHub

Merge EmbeddedLLM/vllm-rocm into vLLM main (#1836)


Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Amir Balwel <amoooori04@gmail.com>
Co-authored-by: root <kuanfu.liu@akirakan.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com>
Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com>
parent c8e7eb1e
@@ -177,3 +177,7 @@ _build/
# vim swap files
*.swo
*.swp
# hip files generated by PyTorch
*.hip
*_hip*
FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# Install additional build and runtime utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
&& rm -rf /var/lib/apt/lists/*
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
# Install ROCm flash-attention
RUN mkdir libs \
&& cd libs \
&& git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \
&& cd flash-attention \
&& git checkout 3d2b6f5 \
&& git submodule update --init \
&& export GPU_ARCHS=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) \
&& patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \
&& python3 setup.py install \
&& cd ..
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip
RUN pip install xformers==0.0.22.post7 --no-deps
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& bash patch_xformers-0.0.22.post7.rocm.sh \
&& python3 setup.py install \
&& cd ..
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]
CMD ["/bin/bash"]
@@ -17,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone
---
*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
@@ -43,6 +44,7 @@ vLLM is flexible and easy to use with:
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA CUDA and AMD ROCm
vLLM seamlessly supports many Hugging Face models, including the following architectures:
...
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
namespace vllm {
@@ -18,8 +19,8 @@ __global__ void silu_and_mul_kernel(
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
out[token_idx * d + idx] = silu(x) * y;
}
}
@@ -57,7 +58,7 @@ __global__ void activation_kernel(
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
out[token_idx * d + idx] = ACT_FN(x);
}
}
...
@@ -15,6 +15,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
@@ -23,7 +27,11 @@
#include <algorithm>
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@@ -40,7 +48,7 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// Compute the sum per warp.
#pragma unroll
for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
sum += VLLM_SHFL_XOR_SYNC(sum, mask);
}
// Warp leaders store the data to shared memory.
@@ -59,11 +67,11 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// Parallel reduction inside the warp.
#pragma unroll
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
sum += VLLM_SHFL_XOR_SYNC(sum, mask);
}
// Broadcast to other threads.
return VLLM_SHFL_SYNC(sum, 0);
}
// TODO(woosuk): Merge the last two dimensions of the grid.
@@ -223,7 +231,7 @@ __device__ void paged_attention_kernel(
// The 0-th thread of each thread group already has its max qk value.
#pragma unroll
for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
}
if (lane == 0) {
red_smem[warp_idx] = qk_max;
@@ -235,10 +243,10 @@ __device__ void paged_attention_kernel(
qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
#pragma unroll
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
}
// Broadcast the max qk value to all threads.
qk_max = VLLM_SHFL_SYNC(qk_max, 0);
// Get the sum of the exp values.
float exp_sum = 0.f;
@@ -326,7 +334,7 @@ __device__ void paged_attention_kernel(
float acc = accs[i];
#pragma unroll
for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
acc += VLLM_SHFL_XOR_SYNC(acc, mask);
}
accs[i] = acc;
}
@@ -492,7 +500,7 @@ __global__ void paged_attention_v2_reduce_kernel(
// Reduce within the warp.
#pragma unroll
for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
}
if (lane == 0) {
red_smem[warp_idx] = max_logit;
@@ -502,10 +510,10 @@ __global__ void paged_attention_v2_reduce_kernel(
max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
#pragma unroll
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
}
// Broadcast the max value to all threads.
max_logit = VLLM_SHFL_SYNC(max_logit, 0);
// Load rescaled exp sums to shared memory.
float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
@@ -539,9 +547,9 @@ __global__ void paged_attention_v2_reduce_kernel(
} // namespace vllm
#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
((void*)vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>), \
shared_mem_size); \
vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \
<<<grid, block, shared_mem_size, stream>>>( \
out_ptr, \
...
@@ -17,6 +17,7 @@
*/
#pragma once
#include "../cuda_compat.h"
#include "attention_dtypes.h"
#include <float.h>
@@ -39,7 +40,7 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
float qk = sum(qk_vec);
#pragma unroll
for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
qk += VLLM_SHFL_XOR_SYNC(qk, mask);
}
return qk;
}
...
@@ -21,8 +21,17 @@
#include "attention_generic.cuh"
#include "dtype_float32.cuh"
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
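// Alias the HIP bfloat16 types to the CUDA type names used throughout these kernels.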
typedef __hip_bfloat162 __nv_bfloat162;
typedef __hip_bfloat16 __nv_bfloat16;
#endif
#include <stdint.h>
namespace vllm {
@@ -98,7 +107,11 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
#ifndef USE_ROCM
return a + b;
#else
return __hadd(a, b);
#endif
#endif
}
...
@@ -21,6 +21,10 @@
#include "attention_generic.cuh"
#include "dtype_float32.cuh"
#ifdef USE_ROCM
#include <hip/hip_fp16.h>
#endif
#include <stdint.h>
namespace vllm {
@@ -63,21 +67,47 @@ struct FloatVec<uint4> {
// Utility functions for type conversions.
inline __device__ uint32_t h0_h0(uint16_t a) {
#ifndef USE_ROCM
uint32_t b;
asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
return b;
#else
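// ROCm path: replicate the 16-bit half into both halves of a 32-bit word via a union instead of inline PTX.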
union {
uint32_t u32;
uint16_t u16[2];
} tmp;
tmp.u16[0] = a;
tmp.u16[1] = a;
return tmp.u32;
#endif
}
inline __device__ float half_to_float(uint16_t h) {
float f;
#ifndef USE_ROCM
asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
#else
asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h));
#endif
return f;
}
inline __device__ float2 half2_to_float2(uint32_t v) {
#ifndef USE_ROCM
uint16_t lo, hi;
asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
return make_float2(half_to_float(lo), half_to_float(hi));
#else
union {
uint32_t u32;
uint16_t u16[2];
} tmp;
tmp.u32 = v;
float2 ret;
ret.x = half_to_float(tmp.u16[0]);
ret.y = half_to_float(tmp.u16[1]);
return ret;
#endif
}
inline __device__ uint16_t float_to_half(float f) {
@@ -85,7 +115,11 @@ inline __device__ uint16_t float_to_half(float f) {
uint32_t u32;
uint16_t u16[2];
} tmp;
#ifndef USE_ROCM
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
#else
asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f));
#endif
return tmp.u16[0];
}
@@ -94,12 +128,16 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
uint32_t u32;
uint16_t u16[2];
} tmp;
#ifndef USE_ROCM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
#else
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
#endif
#else
tmp.u16[0] = float_to_half(f.x);
tmp.u16[1] = float_to_half(f.y);
#endif
return tmp.u32;
}
@@ -107,13 +145,21 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
// Vector addition.
inline __device__ uint16_t add(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
#else
asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
#endif
return c;
}
inline __device__ uint32_t add(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
#else
asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
#endif
return c;
}
@@ -158,14 +204,22 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
template<>
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
#else
asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
#endif
return c;
}
template<>
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
#else
asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
#endif
return c;
}
@@ -272,7 +326,11 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
// Vector fused multiply-add.
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
uint32_t d;
#ifndef USE_ROCM
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
#else
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
#endif
return d;
}
...
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include <algorithm>
@@ -28,8 +29,8 @@ void swap_blocks(
TORCH_CHECK(false, "Invalid device combination");
}
char *src_ptr = static_cast<char*>(src.data_ptr());
char *dst_ptr = static_cast<char*>(dst.data_ptr());
const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -267,8 +268,8 @@ __global__ void gather_cached_kv_kernel(
+ head_offset * block_size
+ block_offset;
key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]);
value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]);
}
}
@@ -333,8 +334,8 @@ __global__ void gather_cached_kv_kernel_optimized(
src_key_indices[j] = src_key_idx;
src_value_indices[j] = src_value_idx;
keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]);
values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]);
}
#pragma unroll
...
#pragma once
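// Thin portability layer so the same kernel source builds for both CUDA and ROCm/HIP.
// VLLM_LDG: read-only data-cache load on CUDA; plain dereference on ROCm.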
#ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg)
#else
#define VLLM_LDG(arg) *(arg)
#endif
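// CUDA warp shuffles require an explicit participation mask (the *_sync forms); HIP exposes the maskless variants.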
#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask)
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#endif
#ifndef USE_ROCM
#define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
#else
#define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
#endif
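// Raising the dynamic shared memory limit goes through cudaFuncSetAttribute on CUDA and hipFuncSetAttribute on ROCm.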
#ifndef USE_ROCM
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
#else
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif
int get_device_attribute(
int attribute,
int device_id)
...
@@ -61,12 +61,14 @@ void gelu_fast(
torch::Tensor& out,
torch::Tensor& input);
#ifndef USE_ROCM
torch::Tensor awq_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
torch::Tensor _scaling_factors,
torch::Tensor _zeros,
int split_k_iters);
#endif
void squeezellm_gemm(
torch::Tensor vec,
...
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
namespace vllm {
@@ -19,14 +20,14 @@ inline __device__ void apply_rotary_embedding(
// GPT-NeoX style rotary embedding.
x_index = rot_offset;
y_index = embed_dim + rot_offset;
cos = VLLM_LDG(cos_ptr + x_index);
sin = VLLM_LDG(sin_ptr + x_index);
} else {
// GPT-J style rotary embedding.
x_index = 2 * rot_offset;
y_index = 2 * rot_offset + 1;
cos = VLLM_LDG(cos_ptr + x_index / 2);
sin = VLLM_LDG(sin_ptr + x_index / 2);
}
const scalar_t x = arr[x_index];
...
@@ -48,8 +48,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
&rotary_embedding,
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
#ifndef USE_ROCM
// Quantization ops
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
#endif
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
// Cache ops
...
@@ -20,9 +20,17 @@ __device__ inline unsigned int as_unsigned(int i) {
// 4-bit matvec kernel (LUT-based)
__global__ void NUQ4MatMulKernel(
#ifndef USE_ROCM
const half2* __restrict__ vec,
#else
const __half2* __restrict__ vec,
#endif
const int* __restrict__ mat,
#ifndef USE_ROCM
half2* __restrict__ mul,
#else
float2* __restrict__ mul,
#endif
const __half* __restrict__ lookup_table,
int height,
int width,
@@ -35,7 +43,11 @@ __global__ void NUQ4MatMulKernel(
int row = BLOCKHEIGHT4 * blockIdx.x;
int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
#ifndef USE_ROCM
__shared__ half2 blockvec[blockwidth2];
#else
__shared__ __half2 blockvec[blockwidth2];
#endif
__shared__ __half deq2[16][BLOCKWIDTH];
int off = threadIdx.x;
@@ -46,8 +58,13 @@ __global__ void NUQ4MatMulKernel(
}
__half res;
#ifndef USE_ROCM
half2 res2;
half2 tmp2;
#else
__half2 res2;
__half2 tmp2;
#endif
int i;
int k;
@@ -68,48 +85,96 @@ __global__ void NUQ4MatMulKernel(
while (k < blockwidth2) {
tmp1 = as_unsigned(mat[i]);
#ifndef USE_ROCM
res2 = {};
tmp2 = {};
#else
res2.x = __half_as_ushort(__float2half(0));
res2.y = __half_as_ushort(__float2half(0));
tmp2.x = __half_as_ushort(__float2half(0));
tmp2.y = __half_as_ushort(__float2half(0));
#endif
lut_index1 = tmp1 & 0xF;
lut_index2 = (tmp1 >> 4) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 0], res2);
lut_index1 = (tmp1 >> 8) & 0xF;
lut_index2 = (tmp1 >> 12) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 1], res2);
lut_index1 = (tmp1 >> 16) & 0xF;
lut_index2 = (tmp1 >> 20) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 2], res2);
lut_index1 = (tmp1 >> 24) & 0xF;
lut_index2 = (tmp1 >> 28) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 3], res2);
#ifndef USE_ROCM
res = __hadd(__hadd(res2.x, res2.y), res);
#else
res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), res);
#endif
i += width;
k += 4;
}
// col%2 -> only set one of the two values
#ifndef USE_ROCM
half2 res3 = {};
if (col % 2 == 0) {
res3.x = res;
} else {
res3.y = res;
}
#else
__half2 res3;
res3.x = __half_as_ushort(__float2half(0));
res3.y = __half_as_ushort(__float2half(0));
if (col % 2 == 0) {
res3.x = __half_as_ushort(res);
} else {
res3.y = __half_as_ushort(res);
}
#endif
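// Accumulate the per-column result: CUDA adds a packed half2 atomically, while the ROCm path widens each lane to float and adds into the float2 output separately.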
#ifndef USE_ROCM
atomicAdd(&mul[b * width / 2 + col / 2], res3);
#else
int tmp_addr = b * width / 2 + col / 2;
atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
#endif
}
}
@@ -136,10 +201,19 @@ void squeezellm_gemm(
dim3 threads(BLOCKWIDTH);
vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
#ifndef USE_ROCM
(half2*) vec.data<at::Half>(),
#else
(__half2*) vec.data_ptr<at::Half>(),
#endif
mat.data_ptr<int>(),
#ifndef USE_ROCM
(half2*) mul.data<at::Half>(),
(__half*) lookup_table.data<at::Half>(),
#else
(float2*) mul.data_ptr<float>(),
(__half*) lookup_table.data_ptr<at::Half>(),
#endif
height, width, batch, vec_height
);
}
...
@@ -17,13 +17,15 @@
*/
#pragma once
#include "cuda_compat.h"
namespace vllm {
template<typename T>
__inline__ __device__ T warpReduceSum(T val) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1)
val += VLLM_SHFL_XOR_SYNC(val, mask);
return val;
}
...
.. _installation_rocm:
Installation with ROCm
======================
vLLM 0.2.x onwards supports model inference and serving on AMD GPUs with ROCm.
At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
Data types currently supported in ROCm are FP16 and BF16.
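Once installed, the usual offline inference API works unchanged on ROCm. Below is a minimal sketch; the model name and sampling settings are only examples.

.. code-block:: python

    from vllm import LLM, SamplingParams

    # Half precision keeps the run on one of the ROCm-supported data types.
    llm = LLM(model="facebook/opt-125m", dtype="float16")
    sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

    outputs = llm.generate(["Hello, my name is"], sampling_params)
    print(outputs[0].outputs[0].text)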
Requirements
------------
* OS: Linux
* Python: 3.8 -- 3.11 (Verified on 3.10)
* GPU: MI200s
* PyTorch 2.0.1/2.1.1/2.2
* ROCm 5.7
Installation options:
#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image <quick_start_docker_rocm>`
#. :ref:`Build from source <build_from_source_rocm>`
#. :ref:`Build from source with docker <build_from_source_docker_rocm>`
.. _quick_start_docker_rocm:
(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image
---------------------------------------------------------------------------
.. code-block:: console
$ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.3
$ docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
embeddedllminfo/vllm-rocm \
bash
.. _build_from_source_rocm:
Option 2: Build from source
---------------------------
You can build and install vLLM from source:
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
.. code-block:: console
$ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note::
- If you are using ROCm 5.7 with PyTorch 2.1.0 or later, you do not need to apply `hipify_python.patch`; you can build the ROCm flash attention directly.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding window attention.
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`).
2. Set up `xformers==0.0.22.post7` without dependencies, and apply patches to adapt it for ROCm flash attention
.. code-block:: console
$ pip install xformers==0.0.22.post7 --no-deps
$ bash patch_xformers-0.0.22.post7.rocm.sh
3. Build vLLM.
.. code-block:: console
$ cd vllm
$ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
.. _build_from_source_docker_rocm:
Option 3: Build from source with docker
-----------------------------------------------------
You can build and install vLLM from source:
Build a docker image from `Dockerfile.rocm`, and launch a docker container.
.. code-block:: console
$ docker build -f Dockerfile.rocm -t vllm-rocm .
$ docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below:
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note::
- If you are using ROCm 5.7 with PyTorch 2.1.0 or later, you do not need to apply `hipify_python.patch`; you can build the ROCm flash attention directly.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding window attention.
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`).
2. Set up `xformers==0.0.22.post7` without dependencies, and apply patches to adapt it for ROCm flash attention
.. code-block:: console
$ pip install xformers==0.0.22.post7 --no-deps
$ bash patch_xformers-0.0.22.post7.rocm.sh
3. Build vLLM.
.. code-block:: console
$ cd vllm
$ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes.
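After the build finishes, a short check can confirm that the ROCm build of PyTorch and the freshly built vLLM are picked up (a sketch; `torch.version.hip` is populated only on ROCm builds of PyTorch):

.. code-block:: python

    import torch
    import vllm

    # Prints the HIP/ROCm version on a ROCm build of PyTorch; None on CUDA builds.
    print("HIP runtime:", torch.version.hip)
    # torch.cuda.* is the common device API for both back-ends, so this should be
    # True whenever an AMD GPU is visible inside the container.
    print("GPU visible:", torch.cuda.is_available())
    print("vLLM version:", vllm.__version__)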
@@ -39,6 +39,7 @@ vLLM is flexible and easy to use with:
* Tensor parallelism support for distributed inference
* Streaming outputs
* OpenAI-compatible API server
* Support for NVIDIA CUDA and AMD ROCm
For more information, check out the following:
@@ -56,6 +57,7 @@ Documentation
:caption: Getting Started
getting_started/installation
getting_started/amd-installation
getting_started/quickstart
.. toctree::
...
#!/bin/bash
export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
echo $XFORMERS_FMHA_FLASH_PATH
echo $XFORMERS_FMHA_COMMON_PATH
if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then
echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"
echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
else
echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
fi
if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then
echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"
echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
else
echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
fi
ninja # For faster builds.
typing-extensions>=4.8.0
starlette
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
huggingface_hub<0.18,>=0.16.4
einops # Required for phi-1_5
transformers >= 4.34.0 # Required for Mistral.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
aioprometheus[starlette]