Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
a40d427b
Unverified
Commit
a40d427b
authored
Sep 23, 2025
by
Michael Yang
Committed by
GitHub
Sep 23, 2025
Browse files
multi-regexp pretokenizer (#12325)
parent
64883e3c
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
124 additions
and
34 deletions
+124
-34
model/bytepairencoding.go
model/bytepairencoding.go
+45
-9
model/bytepairencoding_test.go
model/bytepairencoding_test.go
+39
-1
model/models/gptoss/model.go
model/models/gptoss/model.go
+9
-11
model/models/llama/model.go
model/models/llama/model.go
+24
-4
model/models/llama4/model.go
model/models/llama4/model.go
+1
-2
model/models/mistral3/model.go
model/models/mistral3/model.go
+1
-1
model/models/mllama/model.go
model/models/mllama/model.go
+1
-1
model/models/qwen2/model.go
model/models/qwen2/model.go
+1
-1
model/models/qwen25vl/model.go
model/models/qwen25vl/model.go
+1
-1
model/models/qwen3/embed.go
model/models/qwen3/embed.go
+1
-1
model/models/qwen3/model.go
model/models/qwen3/model.go
+1
-1
sample/samplers_test.go
sample/samplers_test.go
+0
-1
No files found.
model/bytepairencoding.go
View file @
a40d427b
...
@@ -5,6 +5,7 @@ import (
...
@@ -5,6 +5,7 @@ import (
"fmt"
"fmt"
"iter"
"iter"
"log/slog"
"log/slog"
"slices"
"strings"
"strings"
"github.com/dlclark/regexp2"
"github.com/dlclark/regexp2"
...
@@ -13,16 +14,28 @@ import (
...
@@ -13,16 +14,28 @@ import (
)
)
type
BytePairEncoding
struct
{
type
BytePairEncoding
struct
{
pre
*
regexp2
.
Regexp
vocab
*
Vocabulary
vocab
*
Vocabulary
regexps
[]
*
regexp2
.
Regexp
}
}
var
_
TextProcessor
=
(
*
BytePairEncoding
)(
nil
)
var
_
TextProcessor
=
(
*
BytePairEncoding
)(
nil
)
func
NewBytePairEncoding
(
pre
string
,
vocab
*
Vocabulary
)
BytePairEncoding
{
func
NewBytePairEncoding
(
vocab
*
Vocabulary
,
pretokenizers
...
string
)
BytePairEncoding
{
if
len
(
pretokenizers
)
==
0
{
// set default byte-level pretokenizer if none provided, e.g.
// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44
pretokenizers
=
[]
string
{
`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
}
}
return
BytePairEncoding
{
return
BytePairEncoding
{
pre
:
regexp2
.
MustCompile
(
pre
,
regexp2
.
None
),
vocab
:
vocab
,
vocab
:
vocab
,
regexps
:
slices
.
Collect
(
func
(
yield
func
(
*
regexp2
.
Regexp
)
bool
)
{
for
_
,
p
:=
range
pretokenizers
{
if
!
yield
(
regexp2
.
MustCompile
(
p
,
regexp2
.
RE2
))
{
return
}
}
}),
}
}
}
}
...
@@ -35,13 +48,36 @@ func (bpe BytePairEncoding) Is(id int32, special Special) bool {
...
@@ -35,13 +48,36 @@ func (bpe BytePairEncoding) Is(id int32, special Special) bool {
}
}
func
(
bpe
*
BytePairEncoding
)
split
(
s
string
)
iter
.
Seq
[
string
]
{
func
(
bpe
*
BytePairEncoding
)
split
(
s
string
)
iter
.
Seq
[
string
]
{
return
func
(
yield
func
(
string
)
bool
)
{
parts
:=
[]
string
{
s
}
for
m
,
_
:=
bpe
.
pre
.
FindStringMatch
(
s
);
m
!=
nil
;
m
,
_
=
bpe
.
pre
.
FindNextMatch
(
m
)
{
for
_
,
re
:=
range
bpe
.
regexps
{
parts
=
slices
.
Collect
(
func
(
yield
func
(
string
)
bool
)
{
for
_
,
part
:=
range
parts
{
r
:=
[]
rune
(
part
)
var
offset
int
for
m
,
_
:=
re
.
FindRunesMatch
(
r
);
m
!=
nil
;
m
,
_
=
re
.
FindNextMatch
(
m
)
{
if
offset
-
m
.
Index
!=
0
{
if
!
yield
(
string
(
r
[
:
m
.
Index
]))
{
return
}
}
if
!
yield
(
m
.
String
())
{
if
!
yield
(
m
.
String
())
{
break
return
}
offset
=
m
.
Index
+
m
.
Length
}
if
offset
<
len
(
r
)
{
if
!
yield
(
string
(
r
[
offset
:
]))
{
return
}
}
}
}
}
}
})
}
return
slices
.
Values
(
parts
)
}
}
// fragment is a string fragment and their corresponding token IDs
// fragment is a string fragment and their corresponding token IDs
...
...
model/bytepairencoding_test.go
View file @
a40d427b
...
@@ -59,12 +59,12 @@ func llama(t testing.TB) BytePairEncoding {
...
@@ -59,12 +59,12 @@ func llama(t testing.TB) BytePairEncoding {
}
}
return
NewBytePairEncoding
(
return
NewBytePairEncoding
(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
&
Vocabulary
{
&
Vocabulary
{
Values
:
tokens
,
Values
:
tokens
,
Types
:
types
,
Types
:
types
,
Merges
:
merges
,
Merges
:
merges
,
},
},
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}{1,3}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
,
)
)
}
}
...
@@ -282,3 +282,41 @@ func BenchmarkBytePairEncoding(b *testing.B) {
...
@@ -282,3 +282,41 @@ func BenchmarkBytePairEncoding(b *testing.B) {
})
})
}
}
}
}
func
TestSplit
(
t
*
testing
.
T
)
{
cases
:=
[]
struct
{
name
string
patterns
,
want
[]
string
}{
{
name
:
"default"
,
want
:
[]
string
{
"Hello"
,
","
,
" WORLD"
,
"!!"
,
" How"
,
"'s"
,
" it"
,
" going"
,
"?"
,
" 123"
,
" 一二三"
},
},
{
name
:
"unicode"
,
patterns
:
[]
string
{
"
\\
p{N}{1,3}"
,
`[一-龥-ゟ゠-ヿ]+`
,
"[!
\"
#$%&'()*+,
\\
-./:;<=>?@
\\
[
\\\\\\
]^_`{|}~][A-Za-z]+|[^
\r\n\\
p{L}
\\
p{P}
\\
p{S}]?[
\\
p{L}
\\
p{M}]+| ?[
\\
p{P}
\\
p{S}]+[
\r\n
]*|
\\
s*[
\r\n
]+|
\\
s+(?!
\\
S)|
\\
s+"
,
},
want
:
[]
string
{
"Hello"
,
","
,
" WORLD"
,
"!!"
,
" How"
,
"'s"
,
" it"
,
" going"
,
"?"
,
" "
,
"123"
,
" "
,
"一二三"
},
},
{
name
:
"individual digits"
,
patterns
:
[]
string
{
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
,
},
want
:
[]
string
{
"Hello"
,
","
,
" WORLD"
,
"!!"
,
" How"
,
"'s"
,
" it"
,
" going"
,
"?"
,
" "
,
"1"
,
"2"
,
"3"
,
" 一二三"
},
},
}
for
_
,
tt
:=
range
cases
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
tokenizer
:=
NewBytePairEncoding
(
nil
,
tt
.
patterns
...
)
if
diff
:=
cmp
.
Diff
(
tt
.
want
,
slices
.
Collect
(
tokenizer
.
split
(
"Hello, WORLD!! How's it going? 123 一二三"
)));
diff
!=
""
{
t
.
Errorf
(
"no match (-theirs +ours):
\n
%s"
,
diff
)
}
})
}
}
model/models/gptoss/model.go
View file @
a40d427b
...
@@ -227,17 +227,6 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -227,17 +227,6 @@ func New(c fs.Config) (model.Model, error) {
m
:=
Transformer
{
m
:=
Transformer
{
TransformerBlocks
:
make
([]
TransformerBlock
,
c
.
Uint
(
"block_count"
)),
TransformerBlocks
:
make
([]
TransformerBlock
,
c
.
Uint
(
"block_count"
)),
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
strings
.
Join
([]
string
{
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`\p{N}{1,3}`
,
` ?[^\s\p{L}\p{N}]+[\r\n/]*`
,
`\s*[\r\n]+`
,
`\s+(?!\S)`
,
`\s+`
,
},
"|"
),
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -250,6 +239,15 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -250,6 +239,15 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
strings
.
Join
([]
string
{
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`
,
`\p{N}{1,3}`
,
` ?[^\s\p{L}\p{N}]+[\r\n/]*`
,
`\s*[\r\n]+`
,
`\s+(?!\S)`
,
`\s+`
,
},
"|"
),
),
),
Options
:
Options
{
Options
:
Options
{
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
...
...
model/models/llama/model.go
View file @
a40d427b
...
@@ -54,10 +54,30 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -54,10 +54,30 @@ func New(c fs.Config) (model.Model, error) {
}
}
switch
c
.
String
(
"tokenizer.ggml.model"
)
{
switch
c
.
String
(
"tokenizer.ggml.model"
)
{
case
"gpt2"
:
case
"gpt2"
:
processor
=
model
.
NewBytePairEncoding
(
var
pretokenizers
[]
string
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
switch
c
.
String
(
"tokenizer.ggml.pre"
)
{
&
vocabulary
,
case
"default"
:
)
// no-op use the default bpe pretokenizer
case
"qwen2"
:
pretokenizers
=
[]
string
{
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
,
}
case
"refact"
:
pretokenizers
=
[]
string
{
`\p{N}`
,
`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
,
}
case
"tekken"
:
pretokenizers
=
[]
string
{
"[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?[
\\
p{Lu}
\\
p{Lt}
\\
p{Lm}
\\
p{Lo}
\\
p{M}]*[
\\
p{Ll}
\\
p{Lm}
\\
p{Lo}
\\
p{M}]+|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?[
\\
p{Lu}
\\
p{Lt}
\\
p{Lm}
\\
p{Lo}
\\
p{M}]+[
\\
p{Ll}
\\
p{Lm}
\\
p{Lo}
\\
p{M}]*|
\\
p{N}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n/]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
,
}
default
:
// use a llama-style pretokenizer
pretokenizers
=
[]
string
{
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}{1,3}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
,
}
}
processor
=
model
.
NewBytePairEncoding
(
&
vocabulary
,
pretokenizers
...
)
case
"llama"
:
case
"llama"
:
processor
=
model
.
NewSentencePiece
(
&
vocabulary
)
processor
=
model
.
NewSentencePiece
(
&
vocabulary
)
default
:
default
:
...
...
model/models/llama4/model.go
View file @
a40d427b
...
@@ -34,8 +34,6 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
...
@@ -34,8 +34,6 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
m
:=
Model
{
m
:=
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -48,6 +46,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -48,6 +46,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
VisionModel
:
newVisionModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
...
...
model/models/mistral3/model.go
View file @
a40d427b
...
@@ -33,7 +33,6 @@ var _ model.TextProcessor = (*Model)(nil)
...
@@ -33,7 +33,6 @@ var _ model.TextProcessor = (*Model)(nil)
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
m
:=
&
Model
{
m
:=
&
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
TextModel
:
newTextModel
(
c
),
TextModel
:
newTextModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
...
...
model/models/mllama/model.go
View file @
a40d427b
...
@@ -33,7 +33,6 @@ const (
...
@@ -33,7 +33,6 @@ const (
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
m
:=
Model
{
m
:=
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
VisionModel
:
newVisionModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
...
...
model/models/qwen2/model.go
View file @
a40d427b
...
@@ -139,7 +139,6 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -139,7 +139,6 @@ func New(c fs.Config) (model.Model, error) {
m
:=
Model
{
m
:=
Model
{
Layers
:
make
([]
DecoderLayer
,
c
.
Uint
(
"block_count"
)),
Layers
:
make
([]
DecoderLayer
,
c
.
Uint
(
"block_count"
)),
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -152,6 +151,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -152,6 +151,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
Options
:
Options
{
Options
:
Options
{
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
...
...
model/models/qwen25vl/model.go
View file @
a40d427b
...
@@ -29,7 +29,6 @@ var _ model.MultimodalProcessor = (*Model)(nil)
...
@@ -29,7 +29,6 @@ var _ model.MultimodalProcessor = (*Model)(nil)
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
m
:=
&
Model
{
m
:=
&
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -42,6 +41,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -42,6 +41,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
TextModel
:
NewTextModel
(
c
),
TextModel
:
NewTextModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
...
...
model/models/qwen3/embed.go
View file @
a40d427b
...
@@ -35,7 +35,6 @@ func newEmbed(c fs.Config) (model.Model, error) {
...
@@ -35,7 +35,6 @@ func newEmbed(c fs.Config) (model.Model, error) {
}
}
m
:=
embedModel
{
m
:=
embedModel
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -48,6 +47,7 @@ func newEmbed(c fs.Config) (model.Model, error) {
...
@@ -48,6 +47,7 @@ func newEmbed(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
Model
:
&
Model
{
Model
:
&
Model
{
Layers
:
layers
,
Layers
:
layers
,
...
...
model/models/qwen3/model.go
View file @
a40d427b
...
@@ -200,7 +200,6 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -200,7 +200,6 @@ func New(c fs.Config) (model.Model, error) {
m
:=
Model
{
m
:=
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
...
@@ -213,6 +212,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -213,6 +212,7 @@ func New(c fs.Config) (model.Model, error) {
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
),
},
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
,
),
),
Layers
:
layers
,
Layers
:
layers
,
Options
:
&
Options
{
Options
:
&
Options
{
...
...
sample/samplers_test.go
View file @
a40d427b
...
@@ -82,7 +82,6 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
...
@@ -82,7 +82,6 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
merges
:=
make
([]
string
,
0
,
1
)
merges
:=
make
([]
string
,
0
,
1
)
// Only need vocab for Grammar Test
// Only need vocab for Grammar Test
return
model
.
NewBytePairEncoding
(
return
model
.
NewBytePairEncoding
(
``
,
&
model
.
Vocabulary
{
&
model
.
Vocabulary
{
Values
:
tokens
,
Values
:
tokens
,
Types
:
make
([]
int32
,
len
(
vocab
)),
Types
:
make
([]
int32
,
len
(
vocab
)),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment