Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
3003fc03
Commit
3003fc03
authored
Jul 19, 2023
by
Michael Yang
Browse files
update predict code
parent
32aec66e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
175 additions
and
180 deletions
+175
-180
api/types.go
api/types.go
+4
-1
llama/llama.go
llama/llama.go
+162
-81
llama/utils.go
llama/utils.go
+9
-98
No files found.
api/types.go
View file @
3003fc03
...
@@ -134,6 +134,7 @@ type Options struct {
...
@@ -134,6 +134,7 @@ type Options struct {
// Model options
// Model options
NumCtx
int
`json:"num_ctx,omitempty"`
NumCtx
int
`json:"num_ctx,omitempty"`
NumKeep
int
`json:"num_keep,omitempty"`
NumBatch
int
`json:"num_batch,omitempty"`
NumBatch
int
`json:"num_batch,omitempty"`
NumGPU
int
`json:"num_gpu,omitempty"`
NumGPU
int
`json:"num_gpu,omitempty"`
MainGPU
int
`json:"main_gpu,omitempty"`
MainGPU
int
`json:"main_gpu,omitempty"`
...
@@ -158,6 +159,7 @@ type Options struct {
...
@@ -158,6 +159,7 @@ type Options struct {
Mirostat
int
`json:"mirostat,omitempty"`
Mirostat
int
`json:"mirostat,omitempty"`
MirostatTau
float32
`json:"mirostat_tau,omitempty"`
MirostatTau
float32
`json:"mirostat_tau,omitempty"`
MirostatEta
float32
`json:"mirostat_eta,omitempty"`
MirostatEta
float32
`json:"mirostat_eta,omitempty"`
PenalizeNewline
bool
`json:"penalize_newline,omitempty"`
NumThread
int
`json:"num_thread,omitempty"`
NumThread
int
`json:"num_thread,omitempty"`
}
}
...
@@ -176,7 +178,7 @@ func DefaultOptions() Options {
...
@@ -176,7 +178,7 @@ func DefaultOptions() Options {
UseMMap
:
true
,
UseMMap
:
true
,
UseMLock
:
false
,
UseMLock
:
false
,
RepeatLastN
:
512
,
RepeatLastN
:
64
,
RepeatPenalty
:
1.1
,
RepeatPenalty
:
1.1
,
FrequencyPenalty
:
0.0
,
FrequencyPenalty
:
0.0
,
PresencePenalty
:
0.0
,
PresencePenalty
:
0.0
,
...
@@ -188,6 +190,7 @@ func DefaultOptions() Options {
...
@@ -188,6 +190,7 @@ func DefaultOptions() Options {
Mirostat
:
0
,
Mirostat
:
0
,
MirostatTau
:
5.0
,
MirostatTau
:
5.0
,
MirostatEta
:
0.1
,
MirostatEta
:
0.1
,
PenalizeNewline
:
true
,
NumThread
:
runtime
.
NumCPU
(),
NumThread
:
runtime
.
NumCPU
(),
}
}
...
...
llama/llama.go
View file @
3003fc03
package
llama
package
llama
/*
/*
#cgo CPPFLAGS: -O3 -DNDEBUG
=1
-DGGML_USE_K_QUANTS
#cgo CPPFLAGS: -O3
-Wall -Wextra -Werror -Wno-unused-function -Wno-unused-variable
-DNDEBUG -DGGML_USE_K_QUANTS
#cgo CXXFLAGS: -std=
c
++11
#cgo CXXFLAGS: -std=
gnu
++11
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#include <stdlib.h>
#include <stdlib.h>
...
@@ -21,6 +21,7 @@ struct llama_sample_options
...
@@ -21,6 +21,7 @@ struct llama_sample_options
int mirostat;
int mirostat;
float mirostat_tau;
float mirostat_tau;
float mirostat_eta;
float mirostat_eta;
bool penalize_newline;
};
};
llama_token llama_sample(
llama_token llama_sample(
...
@@ -37,6 +38,8 @@ llama_token llama_sample(
...
@@ -37,6 +38,8 @@ llama_token llama_sample(
false,
false,
};
};
struct llama_token_data newline = candidates_p.data[llama_token_nl()];
llama_sample_repetition_penalty(
llama_sample_repetition_penalty(
ctx, &candidates_p,
ctx, &candidates_p,
last_tokens, n_last_tokens,
last_tokens, n_last_tokens,
...
@@ -47,6 +50,10 @@ llama_token llama_sample(
...
@@ -47,6 +50,10 @@ llama_token llama_sample(
last_tokens, n_last_tokens,
last_tokens, n_last_tokens,
opts->frequency_penalty, opts->presence_penalty);
opts->frequency_penalty, opts->presence_penalty);
if (!opts->penalize_newline) {
candidates_p.data[llama_token_nl()] = newline;
}
if (opts->temperature <= 0) {
if (opts->temperature <= 0) {
return llama_sample_token_greedy(ctx, &candidates_p);
return llama_sample_token_greedy(ctx, &candidates_p);
}
}
...
@@ -82,9 +89,9 @@ import (
...
@@ -82,9 +89,9 @@ import (
"errors"
"errors"
"fmt"
"fmt"
"io"
"io"
"log"
"os"
"os"
"strings"
"strings"
"time"
"unicode/utf8"
"unicode/utf8"
"unsafe"
"unsafe"
...
@@ -96,6 +103,10 @@ type LLM struct {
...
@@ -96,6 +103,10 @@ type LLM struct {
model
*
C
.
struct_llama_model
model
*
C
.
struct_llama_model
ctx
*
C
.
struct_llama_context
ctx
*
C
.
struct_llama_context
last
[]
C
.
llama_token
embd
[]
C
.
llama_token
cursor
int
api
.
Options
api
.
Options
}
}
...
@@ -152,16 +163,98 @@ func (llm *LLM) Close() {
...
@@ -152,16 +163,98 @@ func (llm *LLM) Close() {
}
}
func
(
llm
*
LLM
)
Predict
(
ctx
[]
int
,
prompt
string
,
fn
func
(
api
.
GenerateResponse
))
error
{
func
(
llm
*
LLM
)
Predict
(
ctx
[]
int
,
prompt
string
,
fn
func
(
api
.
GenerateResponse
))
error
{
if
input
:=
llm
.
tokenize
(
prompt
);
input
!=
nil
{
C
.
llama_reset_timings
(
llm
.
ctx
)
embd
:=
make
([]
C
.
llama_token
,
len
(
ctx
))
for
i
:=
range
ctx
{
tokens
:=
make
([]
C
.
llama_token
,
len
(
ctx
))
embd
[
i
]
=
C
.
llama_token
(
ctx
[
i
])
for
i
:=
range
tokens
{
tokens
[
i
]
=
C
.
llama_token
(
ctx
[
i
])
}
if
len
(
tokens
)
==
0
{
tokens
=
llm
.
tokenize
(
" "
)
}
llm
.
marshalPrompt
(
tokens
,
prompt
)
C
.
llama_set_rng_seed
(
llm
.
ctx
,
C
.
uint
(
llm
.
Seed
))
var
b
bytes
.
Buffer
for
{
token
,
err
:=
llm
.
next
()
if
errors
.
Is
(
err
,
io
.
EOF
)
{
break
}
else
if
err
!=
nil
{
return
err
}
b
.
WriteString
(
llm
.
detokenize
(
token
))
if
utf8
.
Valid
(
b
.
Bytes
())
||
b
.
Len
()
>=
utf8
.
UTFMax
{
fn
(
api
.
GenerateResponse
{
Response
:
b
.
String
()})
b
.
Reset
()
}
}
}
return
llm
.
generate
(
append
(
embd
,
input
...
),
fn
)
last
:=
make
([]
int
,
0
,
len
(
llm
.
last
))
for
_
,
i
:=
range
llm
.
last
{
if
i
!=
0
{
last
=
append
(
last
,
int
(
i
))
}
}
}
return
errors
.
New
(
"llama: tokenize"
)
timings
:=
C
.
llama_get_timings
(
llm
.
ctx
)
fn
(
api
.
GenerateResponse
{
Done
:
true
,
Context
:
last
,
PromptEvalCount
:
int
(
timings
.
n_p_eval
),
PromptEvalDuration
:
parseDurationMs
(
float64
(
timings
.
t_p_eval_ms
)),
EvalCount
:
int
(
timings
.
n_eval
),
EvalDuration
:
parseDurationMs
(
float64
(
timings
.
t_eval_ms
)),
})
return
nil
}
func
(
llm
*
LLM
)
marshalPrompt
(
ctx
[]
C
.
llama_token
,
prompt
string
)
[]
C
.
llama_token
{
tokens
:=
append
(
ctx
,
llm
.
tokenize
(
prompt
)
...
)
if
llm
.
NumKeep
<
0
{
llm
.
NumKeep
=
len
(
tokens
)
}
// min(llm.NumCtx - 4, llm.NumKeep)
if
llm
.
NumCtx
-
4
<
llm
.
NumKeep
{
llm
.
NumKeep
=
llm
.
NumCtx
-
4
}
if
len
(
tokens
)
>=
llm
.
NumCtx
{
// truncate input
numLeft
:=
(
llm
.
NumCtx
-
llm
.
NumKeep
)
/
2
truncated
:=
tokens
[
:
llm
.
NumKeep
]
erasedBlocks
:=
(
len
(
tokens
)
-
llm
.
NumKeep
-
numLeft
-
1
)
/
numLeft
truncated
=
append
(
truncated
,
tokens
[
llm
.
NumKeep
+
erasedBlocks
*
numLeft
:
]
...
)
copy
(
llm
.
last
,
tokens
[
len
(
tokens
)
-
llm
.
NumCtx
:
])
tokens
=
truncated
log
.
Printf
(
"input truncated: num_ctx=%d num_keep=%d num_left=%d num_tokens=%d"
,
llm
.
NumCtx
,
llm
.
NumKeep
,
numLeft
,
len
(
truncated
))
}
else
{
llm
.
last
=
make
([]
C
.
llama_token
,
llm
.
NumCtx
-
len
(
tokens
))
llm
.
last
=
append
(
llm
.
last
,
tokens
...
)
}
var
i
int
for
i
=
0
;
i
<
len
(
llm
.
embd
)
&&
i
<
len
(
tokens
)
&&
llm
.
embd
[
i
]
==
tokens
[
i
];
i
++
{
// noop
}
llm
.
embd
=
tokens
if
i
==
len
(
tokens
)
{
// evaluate at least one token to generate logits
i
--
}
llm
.
cursor
=
i
log
.
Printf
(
"prompt: num_past=%d cached=%v eval=%v"
,
i
,
len
(
llm
.
embd
[
:
i
]),
len
(
llm
.
embd
[
i
:
]))
return
tokens
}
}
func
(
llm
*
LLM
)
tokenize
(
prompt
string
)
[]
C
.
llama_token
{
func
(
llm
*
LLM
)
tokenize
(
prompt
string
)
[]
C
.
llama_token
{
...
@@ -185,98 +278,86 @@ func (llm *LLM) detokenize(tokens ...C.llama_token) string {
...
@@ -185,98 +278,86 @@ func (llm *LLM) detokenize(tokens ...C.llama_token) string {
return
sb
.
String
()
return
sb
.
String
()
}
}
func
(
llm
*
LLM
)
generate
(
input
[]
C
.
llama_token
,
fn
func
(
api
.
GenerateResponse
))
error
{
func
(
llm
*
LLM
)
next
()
(
C
.
llama_token
,
error
)
{
var
opts
C
.
struct_llama_sample_options
if
len
(
llm
.
embd
)
>=
llm
.
NumCtx
{
opts
.
repeat_penalty
=
C
.
float
(
llm
.
RepeatPenalty
)
numLeft
:=
(
llm
.
NumCtx
-
llm
.
NumKeep
)
/
2
opts
.
frequency_penalty
=
C
.
float
(
llm
.
FrequencyPenalty
)
truncated
:=
llm
.
embd
[
:
llm
.
NumKeep
]
opts
.
presence_penalty
=
C
.
float
(
llm
.
PresencePenalty
)
truncated
=
append
(
truncated
,
llm
.
embd
[
len
(
llm
.
embd
)
-
numLeft
:
]
...
)
opts
.
temperature
=
C
.
float
(
llm
.
Temperature
)
opts
.
top_k
=
C
.
int
(
llm
.
TopK
)
opts
.
top_p
=
C
.
float
(
llm
.
TopP
)
opts
.
tfs_z
=
C
.
float
(
llm
.
TFSZ
)
opts
.
typical_p
=
C
.
float
(
llm
.
TypicalP
)
opts
.
mirostat
=
C
.
int
(
llm
.
Mirostat
)
opts
.
mirostat_tau
=
C
.
float
(
llm
.
MirostatTau
)
opts
.
mirostat_eta
=
C
.
float
(
llm
.
MirostatEta
)
output
:=
deque
[
C
.
llama_token
]{
capacity
:
llm
.
NumCtx
}
context
:=
deque
[
int
]{
capacity
:
llm
.
NumCtx
/
2
}
for
_
,
in
:=
range
input
{
context
.
PushLeft
(
int
(
in
))
}
var
b
bytes
.
Buffer
llm
.
embd
=
truncated
for
C
.
llama_get_kv_cache_token_count
(
llm
.
ctx
)
<
C
.
int
(
llm
.
NumCtx
)
{
llm
.
cursor
=
llm
.
NumKeep
if
retval
:=
C
.
llama_eval
(
llm
.
ctx
,
unsafe
.
SliceData
(
input
),
C
.
int
(
len
(
input
)),
C
.
llama_get_kv_cache_token_count
(
llm
.
ctx
),
C
.
int
(
llm
.
NumThread
));
retval
!=
0
{
log
.
Printf
(
"input truncated: num_ctx=%d num_keep=%d num_left=%d num_tokens=%d cursor=%d"
,
llm
.
NumCtx
,
llm
.
NumKeep
,
numLeft
,
len
(
truncated
),
llm
.
cursor
)
return
errors
.
New
(
"llama: eval"
)
}
}
token
,
err
:=
llm
.
sample
(
output
,
&
opts
)
for
{
if
errors
.
Is
(
err
,
io
.
EOF
)
{
if
llm
.
cursor
>=
len
(
llm
.
embd
)
{
break
break
}
else
if
err
!=
nil
{
return
err
}
}
b
.
WriteString
(
llm
.
detokenize
(
token
))
numEval
:=
len
(
llm
.
embd
)
-
llm
.
cursor
if
utf8
.
Valid
(
b
.
Bytes
())
||
b
.
Len
()
>=
utf8
.
UTFMax
{
if
numEval
>
llm
.
NumBatch
{
// call the callback
numEval
=
llm
.
NumBatch
fn
(
api
.
GenerateResponse
{
Response
:
b
.
String
(),
})
output
.
PushLeft
(
token
)
context
.
PushLeft
(
int
(
token
))
b
.
Reset
()
}
}
input
=
[]
C
.
llama_token
{
token
}
if
retval
:=
C
.
llama_eval
(
llm
.
ctx
,
unsafe
.
SliceData
(
llm
.
embd
[
llm
.
cursor
:
]),
C
.
int
(
numEval
),
C
.
int
(
llm
.
cursor
),
C
.
int
(
llm
.
NumThread
));
retval
!=
0
{
}
return
0
,
fmt
.
Errorf
(
"llama_eval: %d"
,
retval
)
dur
:=
func
(
ms
float64
)
time
.
Duration
{
d
,
err
:=
time
.
ParseDuration
(
fmt
.
Sprintf
(
"%fms"
,
ms
))
if
err
!=
nil
{
panic
(
err
)
}
}
return
d
llm
.
cursor
+=
numEval
}
}
timings
:=
C
.
llama_get_timings
(
llm
.
ctx
)
var
sampleOpts
C
.
struct_llama_sample_options
fn
(
api
.
GenerateResponse
{
sampleOpts
.
repeat_penalty
=
C
.
float
(
llm
.
RepeatPenalty
)
Done
:
true
,
sampleOpts
.
frequency_penalty
=
C
.
float
(
llm
.
FrequencyPenalty
)
Context
:
context
.
Data
(),
sampleOpts
.
presence_penalty
=
C
.
float
(
llm
.
PresencePenalty
)
PromptEvalCount
:
int
(
timings
.
n_p_eval
),
sampleOpts
.
temperature
=
C
.
float
(
llm
.
Temperature
)
PromptEvalDuration
:
dur
(
float64
(
timings
.
t_p_eval_ms
)),
sampleOpts
.
top_k
=
C
.
int
(
llm
.
TopK
)
EvalCount
:
int
(
timings
.
n_eval
),
sampleOpts
.
top_p
=
C
.
float
(
llm
.
TopP
)
EvalDuration
:
dur
(
float64
(
timings
.
t_eval_ms
)),
sampleOpts
.
tfs_z
=
C
.
float
(
llm
.
TFSZ
)
}
)
sampleOpts
.
typical_p
=
C
.
float
(
llm
.
TypicalP
)
sampleOpts
.
mirostat
=
C
.
int
(
llm
.
Mirostat
)
return
nil
sampleOpts
.
mirostat_tau
=
C
.
float
(
llm
.
MirostatTau
)
}
sampleOpts
.
mirostat_eta
=
C
.
float
(
llm
.
MirostatEta
)
sampleOpts
.
penalize_newline
=
C
.
bool
(
llm
.
PenalizeNewline
)
func
(
llm
*
LLM
)
sample
(
output
deque
[
C
.
llama_token
],
opts
*
C
.
struct_llama_sample_options
)
(
C
.
llama_token
,
error
)
{
numVocab
:=
int
(
C
.
llama_n_vocab
(
llm
.
ctx
)
)
numVocab
:=
C
.
llama_n_vocab
(
llm
.
ctx
)
logits
:=
unsafe
.
Slice
(
C
.
llama_get_logits
(
llm
.
ctx
),
numVocab
)
logits
:=
unsafe
.
Slice
(
C
.
llama_get_logits
(
llm
.
ctx
),
numVocab
)
candidates
:=
deque
[
C
.
struct_llama_token_data
]{
capacity
:
numVocab
}
// TODO: logit bias
for
i
:=
0
;
i
<
candidates
.
Cap
();
i
++
{
candidates
.
PushLeft
(
C
.
struct_llama_token_data
{
candidates
:=
make
([]
C
.
llama_token_data
,
numVocab
)
for
i
:=
range
logits
{
candidates
[
i
]
=
C
.
llama_token_data
{
id
:
C
.
int
(
i
),
id
:
C
.
int
(
i
),
logit
:
logits
[
i
],
logit
:
logits
[
i
],
p
:
0
,
p
:
0
,
}
)
}
}
}
repeatLastN
:=
llm
.
RepeatLastN
if
len
(
llm
.
last
)
<
repeatLastN
{
repeatLastN
=
len
(
llm
.
last
)
}
if
llm
.
NumCtx
<
repeatLastN
{
repeatLastN
=
llm
.
NumCtx
}
lastN
:=
llm
.
last
[
len
(
llm
.
last
)
-
repeatLastN
:
]
token
:=
C
.
llama_sample
(
token
:=
C
.
llama_sample
(
llm
.
ctx
,
llm
.
ctx
,
unsafe
.
SliceData
(
candidates
.
Data
()),
C
.
size_t
(
candidates
.
Len
()),
unsafe
.
SliceData
(
candidates
),
C
.
size_t
(
len
(
candidates
)),
unsafe
.
SliceData
(
output
.
Data
()),
C
.
size_t
(
output
.
Len
()),
unsafe
.
SliceData
(
lastN
),
C
.
size_t
(
len
(
lastN
)),
opts
)
&
sampleOpts
,
if
token
!=
C
.
llama_token_eos
()
{
)
return
token
,
nil
llm
.
last
=
append
(
llm
.
last
,
token
)
llm
.
embd
=
append
(
llm
.
embd
,
token
)
if
token
==
C
.
llama_token_eos
()
{
return
0
,
io
.
EOF
}
}
return
0
,
io
.
EOF
return
token
,
nil
}
}
llama/utils.go
View file @
3003fc03
package
llama
package
llama
type
node
[
T
any
]
struct
{
import
(
t
T
"fmt"
next
*
node
[
T
]
"time"
prev
*
node
[
T
]
)
}
type
deque
[
T
any
]
struct
{
head
*
node
[
T
]
tail
*
node
[
T
]
size
int
capacity
int
}
func
(
d
*
deque
[
T
])
Empty
()
bool
{
return
d
.
size
==
0
}
func
(
d
*
deque
[
T
])
Len
()
int
{
return
d
.
size
}
func
(
d
*
deque
[
T
])
Cap
()
int
{
return
d
.
capacity
}
func
(
d
*
deque
[
T
])
Push
(
t
T
)
{
if
d
.
capacity
>
0
&&
d
.
size
>=
d
.
capacity
{
d
.
PopLeft
()
}
n
:=
node
[
T
]{
t
:
t
}
if
d
.
head
!=
nil
{
n
.
next
=
d
.
head
d
.
head
.
prev
=
&
n
d
.
head
=
&
n
}
else
{
d
.
head
=
&
n
d
.
tail
=
&
n
}
d
.
size
++
}
func
(
d
*
deque
[
T
])
PushLeft
(
t
T
)
{
if
d
.
capacity
>
0
&&
d
.
size
>=
d
.
capacity
{
d
.
Pop
()
}
n
:=
node
[
T
]{
t
:
t
}
if
d
.
tail
!=
nil
{
n
.
prev
=
d
.
tail
d
.
tail
.
next
=
&
n
d
.
tail
=
&
n
}
else
{
d
.
head
=
&
n
d
.
tail
=
&
n
}
d
.
size
++
}
func
(
d
*
deque
[
T
])
Pop
()
*
T
{
if
d
.
Empty
()
{
return
nil
}
head
:=
d
.
head
d
.
head
=
head
.
next
if
d
.
head
!=
nil
{
d
.
head
.
prev
=
nil
}
else
{
d
.
tail
=
nil
}
d
.
size
--
return
&
head
.
t
}
func
(
d
*
deque
[
T
])
PopLeft
()
*
T
{
if
d
.
Empty
()
{
return
nil
}
tail
:=
d
.
tail
d
.
tail
=
tail
.
prev
if
d
.
tail
!=
nil
{
d
.
tail
.
next
=
nil
}
else
{
d
.
head
=
nil
}
d
.
size
--
return
&
tail
.
t
}
func
(
d
*
deque
[
T
])
Data
()
(
data
[]
T
)
{
func
parseDurationMs
(
ms
float64
)
time
.
Duration
{
for
n
:=
d
.
head
;
n
!=
nil
;
n
=
n
.
next
{
dur
,
err
:=
time
.
ParseDuration
(
fmt
.
Sprintf
(
"%fms"
,
ms
))
data
=
append
(
data
,
n
.
t
)
if
err
!=
nil
{
panic
(
err
)
}
}
return
d
ata
return
d
ur
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment