Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
9194874d
".github/git@developer.sourcefind.cn:OpenDAS/autoawq.git" did not exist on "24b98c251b87c21b04bfc7e28dc803392da6ce21"
Commit
9194874d
authored
Jul 26, 2025
by
Michael Yang
Browse files
update tokenizer
parent
9d1de41b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
2 deletions
+13
-2
model/bytepairencoding.go
model/bytepairencoding.go
+1
-1
model/models/gptoss/model.go
model/models/gptoss/model.go
+12
-1
No files found.
model/bytepairencoding.go
View file @
9194874d
...
...
@@ -22,7 +22,7 @@ var _ TextProcessor = (*BytePairEncoding)(nil)
func
NewBytePairEncoding
(
pre
string
,
vocab
*
Vocabulary
)
BytePairEncoding
{
return
BytePairEncoding
{
pre
:
regexp2
.
MustCompile
(
pre
,
regexp2
.
Unicode
|
regexp2
.
RE2
),
pre
:
regexp2
.
MustCompile
(
pre
,
regexp2
.
None
),
vocab
:
vocab
,
}
}
...
...
model/models/gptoss/model.go
View file @
9194874d
...
...
@@ -3,6 +3,7 @@ package gptoss
import
(
"cmp"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
...
...
@@ -216,7 +217,17 @@ func New(c fs.Config) (model.Model, error) {
m
:=
Transformer
{
TransformerBlocks
:
make
([]
TransformerBlock
,
c
.
Uint
(
"block_count"
)),
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
strings
.
Join
([]
string
{
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`\p{N}{1,3}`
,
` ?[^\s\p{L}\p{N}]+[\r\n/]*`
,
`\s*[\r\n]+`
,
`\s+(?!\S)`
,
`\s+`
,
},
"|"
),
),
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment