OpenDAS / ollama / Commits

Commit dfc6721b authored Mar 25, 2024 by Jeremy

    add support for libcudart.so for CUDA devices (adds Jetson support)

parent acfa2b94
Showing 8 changed files with 438 additions and 83 deletions (+438 -83):

  gpu/gpu.go                  +123  -32
  gpu/gpu_info.h                +2   -1
  gpu/gpu_info_cudart.c       +190   -0
  gpu/gpu_info_cudart.h        +59   -0
  gpu/gpu_info_nvml.c          +12  -12
  gpu/gpu_info_nvml.h          +13  -13
  llm/generate/gen_common.sh    +1   -1
  llm/generate/gen_linux.sh    +38  -24
gpu/gpu.go
...
@@ -23,7 +23,8 @@ import (
 )
 
 type handles struct {
-	cuda   *C.cuda_handle_t
+	nvml   *C.nvml_handle_t
+	cudart *C.cudart_handle_t
 }
 
 var gpuMutex sync.Mutex
...
@@ -33,7 +34,7 @@ var gpuHandles *handles = nil
 var CudaComputeMin = [2]C.int{5, 0}
 
 // Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
+var NvmlLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
...
@@ -41,49 +42,98 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",
 
 	// TODO: are these stubs ever valid?
 	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
-var CudaWindowsGlobs = []string{
+var NvmlWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }
 
+var CudartLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var CudartWindowsGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
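The variables above are plain filesystem globs that FindGPULibs (defined elsewhere in gpu.go, not shown in this diff) expands to locate candidate libraries. Here is a minimal sketch of that kind of glob-based discovery using Go's filepath.Glob; the findLibs helper is illustrative, not the project's actual implementation:

package main

import (
	"fmt"
	"path/filepath"
)

// findLibs expands each glob pattern in order and collects every match,
// so earlier patterns act as the preferred install locations.
func findLibs(patterns []string) []string {
	var found []string
	for _, pattern := range patterns {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			// Only malformed patterns error out; skip them.
			continue
		}
		found = append(found, matches...)
	}
	return found
}

func main() {
	// A few of the new CudartLinuxGlobs entries; the aarch64/L4T paths
	// are what make discovery work on Jetson boards.
	patterns := []string{
		"/usr/local/cuda/lib64/libcudart.so*",
		"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
		"/usr/lib/aarch64-linux-gnu/libcudart.so*",
	}
	fmt.Println(findLibs(patterns))
}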
 // Note: gpuMutex must already be held
 func initGPUHandles() {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-	gpuHandles = &handles{nil}
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
+	gpuHandles = &handles{nil, nil}
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
+	var cudartMgmtName string
+	var cudartMgmtPatterns []string
+
+	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+		cudartMgmtName = "cudart64_*.dll"
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
+		nvmlMgmtName = "libnvidia-ml.so"
+		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
+		cudartMgmtName = "libcudart.so*"
+		if tmpDir != "" {
+			// TODO - add "payloads" for subprocess
+			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
+		}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
 		return
 	}
 	slog.Info("Detecting GPU type")
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			slog.Info("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
+	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	if len(cudartLibPaths) > 0 {
+		cudart := LoadCUDARTMgmt(cudartLibPaths)
+		if cudart != nil {
+			slog.Info("Nvidia GPU detected via cudart")
+			gpuHandles.cudart = cudart
+			return
+		}
+	}
+
+	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
+	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(nvmlLibPaths) > 0 {
+		nvml := LoadNVMLMgmt(nvmlLibPaths)
+		if nvml != nil {
+			slog.Info("Nvidia GPU detected via nvidia-ml")
+			gpuHandles.nvml = nvml
 			return
 		}
 	}
 }
 
 func GetGPUInfo() GpuInfo {
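initGPUHandles now probes cudart first and only falls back to the older nvidia-ml path when that fails. A standalone sketch of the same first-match-wins ordering; the probe type and detect helper are illustrative, not code from this commit:

package main

import "fmt"

// probe models one detection backend: load reports whether its
// library could be located and initialized.
type probe struct {
	name string
	load func() bool
}

// detect walks the probes in priority order and stops at the first
// success, mirroring the cudart-then-nvml ordering above.
func detect(probes []probe) (string, bool) {
	for _, p := range probes {
		if p.load() {
			return p.name, true
		}
	}
	return "", false
}

func main() {
	backends := []probe{
		{"cudart", func() bool { return false }},   // e.g. libcudart.so not found
		{"nvidia-ml", func() bool { return true }}, // fallback succeeds
	}
	if name, ok := detect(backends); ok {
		fmt.Println("Nvidia GPU detected via", name)
	}
}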
...
@@ -103,23 +153,42 @@ func GetGPUInfo() GpuInfo {
 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.nvml_compute_capability_t
+			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+			} else {
+				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
 		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
+			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
-			var cc C.cuda_compute_capability_t
-			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			var cc C.cudart_compute_capability_t
+			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
 			if cc.err != nil {
-				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
+				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 			} else {
-				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else {
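Both branches gate GPU use on CudaComputeMin = {5, 0} with the same comparison. A small runnable restatement of that predicate; meetsMinimum is a hypothetical helper name:

package main

import "fmt"

// cudaComputeMin mirrors the {5, 0} floor: devices below compute
// capability 5.0 fall back to CPU mode.
var cudaComputeMin = [2]int{5, 0}

// meetsMinimum applies the same predicate as the diff above.
func meetsMinimum(major, minor int) bool {
	return major > cudaComputeMin[0] ||
		(major == cudaComputeMin[0] && minor >= cudaComputeMin[1])
}

func main() {
	fmt.Println(meetsMinimum(8, 7)) // true:  e.g. Jetson Orin (8.7)
	fmt.Println(meetsMinimum(5, 3)) // true:  e.g. Jetson Nano (5.3)
	fmt.Println(meetsMinimum(3, 5)) // false: pre-Maxwell, CPU fallback
}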
...
@@ -176,6 +245,11 @@ func CheckVRAM() (int64, error) {
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
+		// Assigning full reported free memory for Tegras due to OS controlled caching.
+		if CudaTegra != "" {
+			// Setting overhead for non-Tegra devices
+			overhead = 0
+		}
 		avail := int64(gpuInfo.FreeMemory - overhead)
 		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
 		return avail, nil
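The new Tegra case zeroes the per-GPU reservation because the OS manages caching on those boards, so the full reported free memory is usable. A self-contained sketch of the resulting arithmetic, assuming the same 1 GiB-per-GPU reservation; availableVRAM is an illustrative helper and 5.1.2 a made-up Jetpack version:

package main

import (
	"fmt"
	"os"
)

// availableVRAM mirrors CheckVRAM's overhead rule: reserve roughly
// 1 GiB per GPU, except on Jetson/Tegra (signalled by the factory-set
// JETSON_JETPACK variable), where no overhead is subtracted.
func availableVRAM(freeMemory, gpus uint64) int64 {
	overhead := uint64(0)
	if overhead < gpus*1024*1024*1024 {
		overhead = gpus * 1024 * 1024 * 1024
	}
	if os.Getenv("JETSON_JETPACK") != "" {
		overhead = 0
	}
	return int64(freeMemory - overhead)
}

func main() {
	free := uint64(8 * 1024 * 1024 * 1024) // 8 GiB reported free
	fmt.Printf("non-Tegra: %dM\n", availableVRAM(free, 1)/1024/1024) // 7168M
	os.Setenv("JETSON_JETPACK", "5.1.2")
	fmt.Printf("Tegra:     %dM\n", availableVRAM(free, 1)/1024/1024) // 8192M
}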
...
@@ -238,15 +312,32 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 	return gpuLibPaths
 }
 
-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
+func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
+	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	for _, libPath := range cudaLibPaths {
+	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
+		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
+			slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch
...
gpu/gpu_info.h

...
@@ -52,7 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif
 
-#include "gpu_info_cuda.h"
+#include "gpu_info_nvml.h"
+#include "gpu_info_cudart.h"
 
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
\ No newline at end of file
gpu/gpu_info_cudart.c (new file, 0 → 100644)

#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include "gpu_info_cudart.h"

void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  cudartReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
      {NULL, NULL},
  };

  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cudart_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);

  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!l[i].p) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.cudaSetDevice)(0);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  int version = 0;
  cudartDriverVersion_t driverVersion;
  driverVersion.major = 0;
  driverVersion.minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
    driverVersion.major = version / 1000;
    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  }
}

void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0, 0, 0};
  cudartReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("cudart handle isn't initialized");
    return;
  }

  // cudaGetDeviceCount takes int type, resp->count is uint
  int deviceCount;
  ret = (*h.cudaGetDeviceCount)(&deviceCount);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  } else {
    resp->count = (unsigned int)deviceCount;
  }

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    ret = (*h.cudaSetDevice)(i);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device failed to initialize");
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
      resp->err = strdup(buf);
      return;
    }

    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}

void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
  int major = 0;
  int minor = 0;
  cudartReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("cudart handle not initialized");
    return;
  }

  int devices;
  ret = (*h.cudaGetDeviceCount)(&devices);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < devices; i++) {
    ret = (*h.cudaSetDevice)(i);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device failed to initialize");
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    // Report the lowest major.minor we detect as that limits our compatibility
    if (resp->major == 0 || resp->major > major) {
      resp->major = major;
      resp->minor = minor;
    } else if (resp->major == major && resp->minor > minor) {
      resp->minor = minor;
    }
  }
}

#endif  // __APPLE__
\ No newline at end of file
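cudaDriverGetVersion reports a single packed integer (12020 encodes 12.2, for example), and cudart_init above splits it into major and minor. A Go sketch restating that arithmetic:

package main

import "fmt"

// decodeDriverVersion splits the packed CUDA version integer the same
// way cudart_init does: major = v/1000, minor = (v - major*1000)/10.
func decodeDriverVersion(v int) (major, minor int) {
	major = v / 1000
	minor = (v - major*1000) / 10
	return major, minor
}

func main() {
	major, minor := decodeDriverVersion(12020)
	fmt.Printf("CUDA driver version: %d-%d\n", major, minor) // 12-2
}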
gpu/gpu_info_cudart.h (new file, 0 → 100644)

#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__
#include "gpu_info.h"

// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDART_UNSUPPORTED = 1,
  // Other values omitted for now...
} cudartReturn_t;

typedef enum cudartDeviceAttr_enum {
  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76,
} cudartDeviceAttr_t;

typedef void *cudartDevice_t;  // Opaque is sufficient
typedef struct cudartMemory_st {
  size_t total;
  size_t free;
  size_t used;
} cudartMemory_t;

typedef struct cudartDriverVersion {
  int major;
  int minor;
} cudartDriverVersion_t;

typedef struct cudart_handle {
  void *handle;
  uint16_t verbose;
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
  cudartReturn_t (*cudaGetDeviceCount)(int *);
  cudartReturn_t (*cudaDeviceGetAttribute)(int *value, cudartDeviceAttr_t attr, int device);
  cudartReturn_t (*cudaDriverGetVersion)(int *driverVersion);
} cudart_handle_t;

typedef struct cudart_init_resp {
  char *err;  // If err is non-null handle is invalid
  cudart_handle_t ch;
} cudart_init_resp_t;

typedef struct cudart_compute_capability {
  char *err;
  int major;
  int minor;
} cudart_compute_capability_t;

void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);

#endif  // __GPU_INFO_CUDART_H__
#endif  // __APPLE__
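cudart_compute_capability in gpu_info_cudart.c reduces per-device results to the lowest major.minor, since the weakest GPU bounds what can run everywhere. A sketch of that reduction in Go; the capability type is illustrative:

package main

import "fmt"

type capability struct{ major, minor int }

// lowest mirrors the reduction in cudart_compute_capability: keep the
// smallest major, and within an equal major the smallest minor.
func lowest(devices []capability) capability {
	var resp capability
	for _, d := range devices {
		if resp.major == 0 || resp.major > d.major {
			resp = d
		} else if resp.major == d.major && resp.minor > d.minor {
			resp.minor = d.minor
		}
	}
	return resp
}

func main() {
	// A mixed machine, e.g. an 8.6 card alongside a 6.1 card.
	fmt.Println(lowest([]capability{{8, 6}, {6, 1}})) // {6 1}
}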
gpu/gpu_info_cuda.c → gpu/gpu_info_nvml.c

 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 
-#include "gpu_info_cuda.h"
 #include <string.h>
+#include "gpu_info_nvml.h"
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
...
@@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {NULL, NULL},
   };
 
-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             nvml_lib_path, msg);
     free(msg);
     resp->err = strdup(buf);
     return;
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
 
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
...
@@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   }
 }
 
-void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
...
@@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   int i;
 
   if (h.handle == NULL) {
-    resp->err = strdup("nvml handle sn't initialized");
+    resp->err = strdup("nvml handle isn't initialized");
     return;
   }
...
@@ -155,15 +155,15 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
      }
    }
-    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;
   }
 }
 
-void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
   resp->err = NULL;
   resp->major = 0;
   resp->minor = 0;
...
gpu/gpu_info_cuda.h → gpu/gpu_info_nvml.h

 #ifndef __APPLE__
-#ifndef __GPU_INFO_CUDA_H__
-#define __GPU_INFO_CUDA_H__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
 #include "gpu_info.h"
 // Just enough typedef's to dlopen/dlsym for memory information
...
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
   NVML_BRAND_UNKNOWN = 0,
 } nvmlBrandType_t;
 
-typedef struct cuda_handle {
+typedef struct nvml_handle {
   void *handle;
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
...
@@ -35,22 +35,22 @@ typedef struct cuda_handle {
   nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char *version, unsigned int length);
   nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char *partNumber, unsigned int length);
   nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t *type);
-} cuda_handle_t;
+} nvml_handle_t;
 
-typedef struct cuda_init_resp {
+typedef struct nvml_init_resp {
   char *err;  // If err is non-null handle is invalid
-  cuda_handle_t ch;
-} cuda_init_resp_t;
+  nvml_handle_t ch;
+} nvml_init_resp_t;
 
-typedef struct cuda_compute_capability {
+typedef struct nvml_compute_capability {
   char *err;
   int major;
   int minor;
-} cuda_compute_capability_t;
+} nvml_compute_capability_t;
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
-void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
-void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
 
-#endif  // __GPU_INFO_CUDA_H__
+#endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__
\ No newline at end of file
llm/generate/gen_common.sh

...
@@ -39,7 +39,7 @@ init_vars() {
         *)
             ;;
     esac
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ]; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
 }
...
llm/generate/gen_linux.sh

...
@@ -90,30 +90,35 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         compress_libs
     fi
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
-        #
-        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-        # Approximately 400% faster than LCD on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-        echo "Building AVX CPU"
-        build
-        compress_libs
-    fi
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-        #
-        # ~2013 CPU Dynamic library
-        # Approximately 10% faster than AVX on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-        echo "Building AVX2 CPU"
-        build
-        compress_libs
-    fi
+    if [ "${ARCH}" == "x86_64" ]; then
+        #
+        # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
+        #
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+            #
+            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+            # Approximately 400% faster than LCD on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+            echo "Building AVX CPU"
+            build
+            compress_libs
+        fi
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+            #
+            # ~2013 CPU Dynamic library
+            # Approximately 10% faster than AVX on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+            echo "Building AVX2 CPU"
+            build
+            compress_libs
+        fi
+    fi
 else
...
@@ -142,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     if [ -n "${CUDA_MAJOR}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    if [ "${ARCH}" == "arm64" ]; then
+        echo "ARM CPU detected - disabling unsupported AVX instructions"
+
+        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
+        #
+        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
+        # Disabling has minimal performance effect while maintaining compatibility.
+        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+    fi
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
 
-    # Cary the CUDA libs as payloads to help reduce dependency burden on users
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
     #
     # TODO - in the future we may shift to packaging these separately and conditionally
     # downloading them in the install script.
...