Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
ac5550a0
Commit
ac5550a0
authored
May 30, 2023
by
Tim Dettmers
Browse files
Added changes for deployment.
parent
0f40fa3f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
15 deletions
+7
-15
Makefile
Makefile
+0
-1
csrc/kernels.cu
csrc/kernels.cu
+7
-3
deploy.sh
deploy.sh
+0
-11
No files found.
Makefile
View file @
ac5550a0
...
...
@@ -33,7 +33,6 @@ COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
COMPUTE_CAPABILITY
+=
-gencode
arch
=
compute_60,code
=
sm_60
# Pascal
COMPUTE_CAPABILITY
+=
-gencode
arch
=
compute_61,code
=
sm_61
# Pascal
COMPUTE_CAPABILITY
+=
-gencode
arch
=
compute_70,code
=
sm_70
# Volta
COMPUTE_CAPABILITY
+=
-gencode
arch
=
compute_72,code
=
sm_72
# Volta
CC_KEPLER
:=
-gencode
arch
=
compute_35,code
=
sm_35
# Kepler
CC_KEPLER
+=
-gencode
arch
=
compute_37,code
=
sm_37
# Kepler
...
...
csrc/kernels.cu
View file @
ac5550a0
...
...
@@ -16,15 +16,12 @@
#include <thrust/device_vector.h>
#include <mma.h>
#include <cooperative_groups/memcpy_async.h>
#include <cuda/pipeline>
#define HLF_MAX 65504
#define TH 1024
#define NUM 4
#define NUM_BLOCK 4096
using
namespace
nvcuda
;
// source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__
float
atomicMax
(
float
*
address
,
float
val
)
{
...
...
@@ -3094,6 +3091,9 @@ template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_l
#define WARPS 5
template
<
typename
T
,
int
BITS
,
int
THREADS
>
__global__
void
gemm_device
(
int
M
,
int
N
,
int
K
,
T
*
__restrict__
const
A
,
T
*
B
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
)
{
#if __CUDA_ARCH__ >= 750
using
namespace
nvcuda
;
int
col_offset
=
blockIdx
.
x
*
32
;
const
int
warp_id
=
threadIdx
.
x
/
32
;
const
int
half_warp_id
=
threadIdx
.
x
/
16
;
...
...
@@ -3294,11 +3294,14 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
if
(
col_offset
+
warp_lane
<
M
)
out
[
col_offset
+
warp_lane
]
=
smem_A
[
warp_lane
];
#endif
}
template
<
typename
T
,
int
THREADS
>
__global__
void
kgemm_4bit_inference
(
int
M
,
int
N
,
int
K
,
T
*
__restrict__
const
A
,
unsigned
char
*
B
,
float
*
absmax
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
,
int
blocksize
)
{
#if __CUDA_ARCH__ >= 750
using
namespace
nvcuda
;
int
col_offset
=
blockIdx
.
x
*
32
;
const
int
warp_id
=
threadIdx
.
x
/
32
;
const
int
half_warp_id
=
threadIdx
.
x
/
16
;
...
...
@@ -3459,6 +3462,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
if
(
col_offset
+
warp_lane
<
M
)
out
[
col_offset
+
warp_lane
]
=
smem_A
[
warp_lane
];
#endif
}
//#define ROWS 2
...
...
deploy.sh
View file @
ac5550a0
...
...
@@ -139,17 +139,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
fi
make clean
export
CUDA_HOME
=
$BASE_PATH
/cuda-10.2
make cuda10x_nomatmul
CUDA_VERSION
=
102
if
[
!
-f
"./bitsandbytes/libbitsandbytes_cuda102_nocublaslt.so"
]
;
then
# Control will enter here if $DIRECTORY doesn't exist.
echo
"Compilation unsuccessul!"
1>&2
exit
64
fi
make clean
export
CUDA_HOME
=
$BASE_PATH
/cuda-11.0
make cuda110_nomatmul
CUDA_VERSION
=
110
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment