ollama · Commit d6dd2ff8 (unverified)

Authored Apr 01, 2024 by Michael Yang; committed via GitHub on Apr 01, 2024.

Merge pull request #3241 from ollama/mxyng/mem

update memory estimations for gpu offloading

Parents: e57a6ba8, 91b3e4d2

Showing 7 changed files with 121 additions and 85 deletions:
  format/bytes.go        +16  -1
  gpu/gpu.go             +11  -14
  gpu/types.go            +3   -0
  llm/dyn_ext_server.go   +2   -2
  llm/ggml.go            +11   -0
  llm/llm.go             +73  -63
  server/routes.go        +5   -5
format/bytes.go

@@ -6,11 +6,15 @@ import (
 )
 
 const (
 	Byte     = 1
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
+
+	KibiByte = Byte * 1024
+	MebiByte = KibiByte * 1024
 )
 
 func HumanBytes(b int64) string {
@@ -45,3 +49,14 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
+
+func HumanBytes2(b int64) string {
+	switch {
+	case b >= MebiByte:
+		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
+	case b >= KibiByte:
+		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
+	default:
+		return fmt.Sprintf("%d B", b)
+	}
+}
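The new HumanBytes2 formats binary (base-1024) sizes; the offload logging added to llm/llm.go below uses it for all of its byte counts, while the existing HumanBytes keeps reporting decimal units. A minimal standalone sketch of the added behavior (the real constants and function live in the ollama/format package):

package main

import "fmt"

// Mirrors the binary-size constants added to format/bytes.go.
const (
	kibiByte = 1024
	mebiByte = kibiByte * 1024
)

// humanBytes2 mirrors the added helper: largest binary unit first.
func humanBytes2(b int64) string {
	switch {
	case b >= mebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/mebiByte)
	case b >= kibiByte:
		return fmt.Sprintf("%.1f KiB", float64(b)/kibiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(humanBytes2(377 * mebiByte)) // 377.0 MiB, the new minimum-memory constant
	fmt.Println(humanBytes2(1536))           // 1.5 KiB
	fmt.Println(humanBytes2(512))            // 512 B
}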
gpu/gpu.go

@@ -20,6 +20,8 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
+
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {
@@ -27,6 +29,11 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }
 
+const (
+	cudaMinimumMemory = 377 * format.MebiByte
+	rocmMinimumMemory = 377 * format.MebiByte
+)
+
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil
@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}
@@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}
@@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}
@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
 	}
 
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*1024*1024*1024 {
-			overhead = gpus * 1024 * 1024 * 1024
-		}
-		// Assigning full reported free memory for Tegras due to OS controlled caching.
-		if CudaTegra != "" {
-			// Setting overhead for non-Tegra devices
-			overhead = 0
-		}
-		avail := int64(gpuInfo.FreeMemory - overhead)
-		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
-		return avail, nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
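Two accounting changes land in this file: CheckVRAM now returns the driver-reported free memory verbatim, and GetGPUInfo instead attaches a fixed 377 MiB MinimumMemory per backend that the estimator in llm/llm.go treats as already spent. A rough sketch contrasting the removed blanket reservation with the new fixed floor, using a hypothetical single 8 GiB card (numbers are illustrative only):

package main

import "fmt"

const (
	mebiByte = 1024 * 1024
	gibiByte = 1024 * mebiByte
)

func main() {
	free := int64(8 * gibiByte) // hypothetical free VRAM, as CheckVRAM now reports it
	devices := int64(1)

	// Removed scheme: reserve max(10% of free, 1 GiB per GPU) inside CheckVRAM.
	oldOverhead := free / 10
	if oldOverhead < devices*gibiByte {
		oldOverhead = devices * gibiByte
	}

	// New scheme: a fixed per-library floor (cudaMinimumMemory/rocmMinimumMemory),
	// with the rest accounted layer by layer in llm.New.
	newOverhead := int64(377 * mebiByte)

	fmt.Printf("old reservation: %d MiB\n", oldOverhead/mebiByte) // 1024 MiB
	fmt.Printf("new reservation: %d MiB\n", newOverhead/mebiByte) // 377 MiB
}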
gpu/types.go

@@ -14,6 +14,9 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 
+	// MinimumMemory represents the minimum memory required to use the GPU
+	MinimumMemory int64 `json:"-"`
+
 	// TODO add other useful attributes about the card here for discovery information
 }
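The json:"-" tag keeps MinimumMemory out of the serialized GpuInfo: it is internal bookkeeping for the new estimator, not part of the discovery payload.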
llm/dyn_ext_server.go

@@ -39,7 +39,7 @@ import (
 type dynExtServer struct {
 	s       C.struct_dynamic_llama_server
-	options api.Options
+	options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if !mutex.TryLock() {
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
llm/ggml.go

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strings"
 )
 
 type GGML struct {
@@ -12,6 +13,16 @@ type GGML struct {
 	model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+	for _, t := range ggml.Tensors() {
+		if strings.HasPrefix(t.Name, prefix) {
+			n += int64(t.size())
+		}
+	}
+
+	return
+}
+
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
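LayerSize sums the byte sizes of every tensor whose name begins with the given prefix, which lets the estimator cost one repeating block ("blk.N.") or the output head ("output.") at a time. A toy sketch of the same prefix-summing idea over a made-up tensor table (names and sizes here are illustrative, not taken from a real model):

package main

import (
	"fmt"
	"strings"
)

// tensor is a hypothetical stand-in for the GGML tensor metadata.
type tensor struct {
	name string
	size int64
}

// layerSize mirrors the prefix matching of (*GGML).LayerSize.
func layerSize(tensors []tensor, prefix string) (n int64) {
	for _, t := range tensors {
		if strings.HasPrefix(t.name, prefix) {
			n += t.size
		}
	}

	return
}

func main() {
	tensors := []tensor{
		{"blk.0.attn_q.weight", 32 << 20},
		{"blk.0.ffn_up.weight", 88 << 20},
		{"blk.1.attn_q.weight", 32 << 20},
		{"output.weight", 260 << 20},
	}

	fmt.Println(layerSize(tensors, "blk.0."))  // 125829120: one repeating layer
	fmt.Println(layerSize(tensors, "output.")) // 272629760: the output head
}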
llm/llm.go

@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"runtime"
 	"slices"
+	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
 	"mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, size, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
@@ -49,84 +50,93 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
 
-	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.KV().GQA()) * kv / 6
+	usedMemory += graph
 
-	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		opts.NumGPU = 0
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
 	}
 
-	info := gpu.GetGPUInfo()
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
-
-		if size+kv+graph > vram {
-			slog.Info("not enough vram available, setting num_gpu=0")
-			opts.NumGPU = 0
-			break
-		}
-
-		// TODO: implement layer splitting on macOS
-		opts.NumGPU = 999
-	default:
-		if info.Library == "cpu" {
-			slog.Info("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
-		}
-
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			break
-		}
-
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
-
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.KV().BlockCount()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
-
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = int(layers)
-	}
-
-	opts.RopeFrequencyBase = 0.0
-	opts.RopeFrequencyScale = 0.0
+	requiredMemory := usedMemory
+
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
+
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
+		}
+	}
+
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
+
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
+
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
+
 	return newLlmServer(info, model, adapters, projectors, opts)
 }
 
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
+}
+
 // Give any native cgo implementations an opportunity to initialize
@@ -134,7 +144,7 @@ func Init() error {
 	return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
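The rewritten New offloads repeating layers greedily: each block is charged its tensor bytes plus an equal share of the fp16 kv cache, the output layer is charged only once every repeating layer fits, and with opts.NumGPU < 0 the resulting count becomes NumGPU. As a sanity check of the kv and graph arithmetic, here is the same formula evaluated with hypothetical Llama-7B-like hyperparameters (illustrative values; the real ones come from the GGUF metadata):

package main

import "fmt"

const mebiByte = 1024 * 1024

func main() {
	// Hypothetical model hyperparameters, not read from a real file.
	var (
		numCtx     int64 = 2048 // opts.NumCtx
		blockCount int64 = 32   // n_layer
		embedding  int64 = 4096 // n_embd
		headCount  int64 = 32   // n_head
		headKV     int64 = 32   // n_head_kv
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embedding / headCount * headKV
	fmt.Printf("kv cache: %d MiB\n", kv/mebiByte) // 1024 MiB

	// graph is estimated as 1/6 * kv * num_gqa; gqa = n_head / n_head_kv = 1 here.
	graph := (headCount / headKV) * kv / 6
	fmt.Printf("graph estimate: %d MiB\n", graph/mebiByte) // 170 MiB

	// Each repeating layer carries its weights plus an equal kv share.
	fmt.Printf("kv share per layer: %d MiB\n", kv/blockCount/mebiByte) // 32 MiB
}

With these numbers, the 377 MiB minimum memory plus roughly 170 MiB of graph is spent before the first layer, and each of the 32 blocks then adds its weights plus 32 MiB of kv cache until availableMemory would be exceeded.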
server/routes.go

@@ -68,7 +68,7 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 		loaded.Model = model
 		loaded.runner = llmRunner
-		loaded.Options = &opts
+		loaded.Options = opts
 	}
 
 	loaded.expireAt = time.Now().Add(sessionDuration)
@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
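The api.Options plumbing switches from value to pointer all the way down (load here, plus llm.New, newLlmServer, and newDynExtServer above), presumably so the adjustments New makes in place, clamping NumCtx and computing NumGPU, reach the cached loaded.Options instead of being lost in a private copy. A minimal sketch of the value-vs-pointer behavior at issue (hypothetical types, not the real API):

package main

import "fmt"

type options struct{ NumGPU int }

// byValue mutates a copy; the caller never observes the change.
func byValue(o options) { o.NumGPU = 33 }

// byPointer mutates the caller's struct, like the new *api.Options signatures.
func byPointer(o *options) { o.NumGPU = 33 }

func main() {
	o := options{NumGPU: -1}

	byValue(o)
	fmt.Println(o.NumGPU) // -1: the computed layer count is lost

	byPointer(&o)
	fmt.Println(o.NumGPU) // 33: the computed layer count propagates
}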