Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
44560129
Commit
44560129
authored
Dec 04, 2024
by
Michael Yang
Browse files
fix unmarshaling merges
parent
55c3efa9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
79 additions
and
4 deletions
+79
-4
convert/tokenizer.go
convert/tokenizer.go
+23
-4
convert/tokenizer_test.go
convert/tokenizer_test.go
+56
-0
No files found.
convert/tokenizer.go
View file @
44560129
...
...
@@ -10,6 +10,7 @@ import (
"log/slog"
"os"
"slices"
"strings"
"golang.org/x/exp/maps"
)
...
...
@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
addedTokens
[
t
.
Content
]
=
t
}
t
.
Merges
=
tt
.
Model
.
Merges
if
len
(
tt
.
Model
.
Merges
)
==
0
{
// noop; merges is empty
}
else
if
err
:=
json
.
Unmarshal
(
tt
.
Model
.
Merges
,
&
t
.
Merges
);
err
==
nil
{
// noop; merges is []string
}
else
if
merges
,
err
:=
func
()
([][]
string
,
error
)
{
var
merges
[][]
string
if
err
:=
json
.
Unmarshal
(
tt
.
Model
.
Merges
,
&
merges
);
err
!=
nil
{
return
nil
,
err
}
return
merges
,
nil
}();
err
==
nil
{
t
.
Merges
=
make
([]
string
,
len
(
merges
))
for
i
:=
range
merges
{
t
.
Merges
[
i
]
=
strings
.
Join
(
merges
[
i
],
" "
)
}
}
else
{
return
nil
,
fmt
.
Errorf
(
"could not parse tokenizer merges. expected []string or [][]string: %w"
,
err
)
}
sha256sum
:=
sha256
.
New
()
for
_
,
pt
:=
range
tt
.
PreTokenizer
.
PreTokenizers
{
...
...
@@ -156,9 +175,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
type
tokenizer
struct
{
AddedTokens
[]
token
`json:"added_tokens"`
Model
struct
{
Type
string
`json:"type"`
Vocab
map
[
string
]
int
`json:"vocab"`
Merges
[]
string
`json:"merges"`
Type
string
`json:"type"`
Vocab
map
[
string
]
int
`json:"vocab"`
Merges
json
.
RawMessage
`json:"merges"`
}
`json:"model"`
PreTokenizer
struct
{
...
...
convert/tokenizer_test.go
View file @
44560129
...
...
@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) {
Pre
:
"default"
,
},
},
{
name
:
"list string merges"
,
fsys
:
createTokenizerFS
(
t
,
t
.
TempDir
(),
map
[
string
]
io
.
Reader
{
"tokenizer.json"
:
strings
.
NewReader
(
`{
"model": {
"merges": [
"a b",
"c d",
"e f"
]
}
}`
),
}),
want
:
&
Tokenizer
{
Vocabulary
:
&
Vocabulary
{
Model
:
"gpt2"
,
},
Merges
:
[]
string
{
"a b"
,
"c d"
,
"e f"
,
},
Pre
:
"default"
,
},
},
{
name
:
"list list string merges"
,
fsys
:
createTokenizerFS
(
t
,
t
.
TempDir
(),
map
[
string
]
io
.
Reader
{
"tokenizer.json"
:
strings
.
NewReader
(
`{
"model": {
"merges": [
[
"a", "b"
],
[
"c", "d"
],
[
"e", "f"
]
]
}
}`
),
}),
want
:
&
Tokenizer
{
Vocabulary
:
&
Vocabulary
{
Model
:
"gpt2"
,
},
Merges
:
[]
string
{
"a b"
,
"c d"
,
"e f"
,
},
Pre
:
"default"
,
},
},
}
for
_
,
tt
:=
range
cases
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment