Unverified Commit ec3ee028 authored by Yineng Zhang, committed by GitHub

fix sgl-kernel cu118 build (#4872)

parent 92941ce7
@@ -25,5 +25,6 @@ docker run --rm \
     ln -s /usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so && \
     cd /sgl-kernel && \
     ls -la ${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages/wheel/ && \
-    PYTHONPATH=${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages ${PYTHON_ROOT_PATH}/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always
+    PYTHONPATH=${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages ${PYTHON_ROOT_PATH}/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always && \
+    ./rename_wheels.sh
     "
 // Adapted from
 // https://github.com/vllm-project/vllm/blob/eb59b5a6cba6727d3727c0372258db9002f687c1/csrc/quantization/awq/gemm_kernels.cu#L350
 #include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
 #include <cuda_fp16.h>
 #include <torch/all.h>
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
@@ -79,6 +80,7 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {
 }
 __device__ uint4 dequantize_s4_to_bf16x2(uint32_t const& source) {
+#if CUDA_VERSION >= 12000
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
   uint4 result;
   uint32_t* h = reinterpret_cast<uint32_t*>(&result);
@@ -118,6 +120,7 @@ __device__ uint4 dequantize_s4_to_bf16x2(uint32_t const& source) {
   assert(false);
   return {};
 #endif
+#endif
 }
 template <typename OutputT>
@@ -128,6 +131,7 @@ __global__ void __launch_bounds__(256) dequantize_weights(
     OutputT* __restrict__ output,
     int group_size,
     int qweight_cols) {
+#if CUDA_VERSION >= 12000
   int col = blockIdx.x * blockDim.x + threadIdx.x;
   int row = blockIdx.y * blockDim.y + threadIdx.y;
@@ -174,6 +178,7 @@ __global__ void __launch_bounds__(256) dequantize_weights(
     static_assert(sizeof(uint4) == 8 * sizeof(OutputT), "Memory layout mismatch");
     *reinterpret_cast<uint4*>(output_ptr) = weight_raw;
   }
+#endif
 }
 torch::Tensor awq_dequantize(torch::Tensor qweight, torch::Tensor scales, torch::Tensor qzeros) {
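The change above keeps the bf16 conversion helper and the dequantize_weights kernel body out of CUDA 11.8 builds: the new #include <cuda.h> provides the CUDA_VERSION macro (11080 for CUDA 11.8, 12000 for 12.0), and the added #if/#endif pairs let the preprocessor drop the guarded code when the toolkit is too old. A minimal sketch of the same pattern, with a hypothetical kernel name (bf16_copy_kernel is not part of sgl-kernel):

// Illustrative only: bf16_copy_kernel is hypothetical and just demonstrates
// the CUDA_VERSION guard used in the diff above.
#include <cuda.h>       // defines CUDA_VERSION, e.g. 11080 for CUDA 11.8
#include <cuda_bf16.h>

__global__ void bf16_copy_kernel(__nv_bfloat16* out, const float* in, int n) {
#if CUDA_VERSION >= 12000
  // Compiled only with a CUDA 12.0+ toolkit; this is where the real work lives.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = __float2bfloat16(in[i]);
  }
#else
  // CUDA 11.x build: keep an empty body so the file still compiles;
  // callers are expected to dispatch to another path instead.
  (void)out;
  (void)in;
  (void)n;
#endif
}

Under CUDA 11.8 the preprocessor removes the guarded body before compilation, so the translation unit still builds even though that functionality is unavailable.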
@@ -15,6 +15,7 @@ limitations under the License.
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
 #include <cuda_fp8.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
@@ -56,6 +57,7 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
   // PTX instructions used here requires sm100a.
+#if CUDA_VERSION >= 12080
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
   uint32_t val;
   asm volatile(
@@ -83,11 +85,13 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
 #else
   return 0;
 #endif
+#endif
 }
 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
   // PTX instructions used here requires sm100a.
+#if CUDA_VERSION >= 12080
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
   uint32_t val;
   asm volatile(
@@ -115,6 +119,7 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
 #else
   return 0;
 #endif
+#endif
 }
 // Fast reciprocal.
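The FP4 file nests the same guard one level further out: the e2m1 packing helpers rely on sm100a-only PTX and the __CUDA_ARCH_HAS_FEATURE__(SM100_ALL) check, which presumably do not exist in the CUDA 11.8 toolkit, so everything down to the architecture check is fenced behind CUDA_VERSION >= 12080 (CUDA 12.8). A rough sketch of the nesting, using a hypothetical helper name (pack_e2m1 stands in for fp32_vec_to_e2m1) and with an explicit fallback return added for clarity:

// Illustrative only: pack_e2m1 is a hypothetical stand-in for the guarded
// fp32_vec_to_e2m1 helpers; the real PTX body is omitted.
#include <cuda.h>
#include <cstdint>

inline __device__ uint32_t pack_e2m1(float (&array)[8]) {
  (void)array;  // the real helper reads all 8 floats; omitted in this sketch
#if CUDA_VERSION >= 12080
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
  // CUDA 12.8+ toolkit and an sm100a compilation target: the e2m1 cvt PTX
  // sequence from the kernel above would go here.
  uint32_t val = 0;
  return val;
#else
  return 0;  // 12.8+ toolkit but not an sm100a target
#endif
#else
  return 0;  // CUDA older than 12.8: the feature check and PTX are never seen
#endif
}

Because a false outer #if skips the inner group without evaluating it, the CUDA 11.8 preprocessor never has to interpret the SM100 feature check or hand the e2m1 PTX to ptxas.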