Added changes for deployment.

ac5550a0 · Tim Dettmers · 0f40fa3f · ac5550a0 · ac5550a0 · ac5550a0
Commit ac5550a0 authored May 30, 2023 by Tim Dettmers
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 15 deletions

Makefile +0 -1

csrc/kernels.cu +7 -3

deploy.sh +0 -11

No files found.
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,6 @@ COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta

 CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
 CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler

--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -16,15 +16,12 @@
 #include <thrust/device_vector.h>
 #include <mma.h>

-#include <cooperative_groups/memcpy_async.h>
-#include <cuda/pipeline>

 #define HLF_MAX 65504
 #define TH 1024
 #define NUM 4
 #define NUM_BLOCK 4096

-using namespace nvcuda;

 // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
 __device__ float atomicMax(float* address, float val) {
@@ -3094,6 +3091,9 @@ template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_l
 #define WARPS 5
 template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc)
 {
+
+#if __CUDA_ARCH__ >= 750
+	using namespace nvcuda;
  int col_offset = blockIdx.x *32;
  const int warp_id = threadIdx.x / 32;
  const int half_warp_id = threadIdx.x / 16;
@@ -3294,11 +3294,14 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,

  if(col_offset + warp_lane < M)
    out[col_offset + warp_lane] = smem_A[warp_lane];
+#endif
 }

 template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize)
 {

+#if __CUDA_ARCH__ >= 750
+	using namespace nvcuda;
  int col_offset = blockIdx.x *32;
  const int warp_id = threadIdx.x / 32;
  const int half_warp_id = threadIdx.x / 16;
@@ -3459,6 +3462,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i

  if(col_offset + warp_lane < M)
    out[col_offset + warp_lane] = smem_A[warp_lane];
+#endif
 }

 //#define ROWS 2

--- a/deploy.sh
+++ b/deploy.sh
@@ -139,17 +139,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
 fi


-make clean
-export CUDA_HOME=$BASE_PATH/cuda-10.2
-make cuda10x_nomatmul CUDA_VERSION=102
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda102_nocublaslt.so" ]; then
-  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
-  exit 64
-fi
-
-
 make clean
 export CUDA_HOME=$BASE_PATH/cuda-11.0
 make cuda110_nomatmul CUDA_VERSION=110