Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
9b5a3c59
Unverified
Commit
9b5a3c59
authored
Apr 25, 2024
by
Daniel Hiltgen
Committed by
GitHub
Apr 25, 2024
Browse files
Merge pull request #3914 from dhiltgen/mac_perf
Improve mac parallel performance
parents
00b0699c
b123be5b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
56 additions
and
0 deletions
+56
-0
llm/patches/04-metal.diff
llm/patches/04-metal.diff
+45
-0
server/sched.go
server/sched.go
+11
-0
No files found.
llm/patches/04-metal.diff
0 → 100644
View file @
9b5a3c59
diff --git a/ggml-metal.m b/ggml-metal.m
index 0207b787..b5e9884b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1396,27 +1396,23 @@
static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;
-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
server/sched.go
View file @
9b5a3c59
...
@@ -46,6 +46,7 @@ type Scheduler struct {
...
@@ -46,6 +46,7 @@ type Scheduler struct {
// TODO set this to zero after a release or two, to enable multiple models by default
// TODO set this to zero after a release or two, to enable multiple models by default
var
loadedMax
=
1
// Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
var
loadedMax
=
1
// Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
var
maxQueuedRequests
=
10
// TODO configurable
var
maxQueuedRequests
=
10
// TODO configurable
var
numParallel
=
1
func
InitScheduler
(
ctx
context
.
Context
)
*
Scheduler
{
func
InitScheduler
(
ctx
context
.
Context
)
*
Scheduler
{
maxRunners
:=
os
.
Getenv
(
"OLLAMA_MAX_LOADED_MODELS"
)
maxRunners
:=
os
.
Getenv
(
"OLLAMA_MAX_LOADED_MODELS"
)
...
@@ -57,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler {
...
@@ -57,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler {
loadedMax
=
m
loadedMax
=
m
}
}
}
}
if
onp
:=
os
.
Getenv
(
"OLLAMA_NUM_PARALLEL"
);
onp
!=
""
{
p
,
err
:=
strconv
.
Atoi
(
onp
)
if
err
!=
nil
||
p
<=
0
{
slog
.
Error
(
"invalid parallel setting, must be greater than zero"
,
"OLLAMA_NUM_PARALLEL"
,
onp
,
"error"
,
err
)
}
else
{
numParallel
=
p
}
}
sched
:=
&
Scheduler
{
sched
:=
&
Scheduler
{
pendingReqCh
:
make
(
chan
*
LlmRequest
,
maxQueuedRequests
),
pendingReqCh
:
make
(
chan
*
LlmRequest
,
maxQueuedRequests
),
...
@@ -81,6 +90,8 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
...
@@ -81,6 +90,8 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
successCh
:
make
(
chan
*
runnerRef
),
successCh
:
make
(
chan
*
runnerRef
),
errCh
:
make
(
chan
error
,
1
),
errCh
:
make
(
chan
error
,
1
),
}
}
// context split across parallel threads
opts
.
NumCtx
=
opts
.
NumCtx
*
numParallel
select
{
select
{
case
s
.
pendingReqCh
<-
req
:
case
s
.
pendingReqCh
<-
req
:
default
:
default
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment