OpenDAS / ollama

Commit 60e47573, authored Aug 27, 2024 by Michael Yang

more tokenizer tests

Parent: eae3af68

Showing 1 changed file with 112 additions and 0 deletions:

convert/tokenizer_test.go (+112, -0)
@@ -79,6 +79,118 @@ func TestParseTokenizer(t *testing.T) {

				Template: "<default template>",
			},
		},
		{
			name: "added tokens",
			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
				"tokenizer.json": strings.NewReader(`{
					"added_tokens": [
						{
							"id": 999,
							"content": "<unused999>",
							"special": false
						}
					]
				}`),
			}),
			want: &Tokenizer{
				Vocabulary: &Vocabulary{
					Model:  "gpt2",
					Tokens: []string{"<unused999>"},
					Scores: []float32{999},
					Types:  []int32{4},
				},
				Pre: "default",
			},
		},
		{
			name: "added tokens overlap vocab",
			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
				"tokenizer.json": strings.NewReader(`{
					"added_tokens": [
						{
							"id": 0,
							"content": "<pad>",
							"special": true
						}
					],
					"model": {
						"vocab": {
							"<pad>": 0
						}
					}
				}`),
			}),
			want: &Tokenizer{
				Vocabulary: &Vocabulary{
					Model:  "gpt2",
					Tokens: []string{"<pad>"},
					Scores: []float32{0},
					Types:  []int32{3},
				},
				Pre: "default",
			},
		},
		{
			name: "special token types",
			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
				"tokenizer.json": strings.NewReader(`{
					"added_tokens": [
						{
							"id": 0,
							"content": "<pad>",
							"special": true
						},
						{
							"id": 1,
							"content": "<eos>",
							"special": true
						},
						{
							"id": 2,
							"content": "<bos>",
							"special": true
						},
						{
							"id": 3,
							"content": "<unk>",
							"special": true
						}
					],
					"model": {
						"vocab": {
							"<pad>": 0,
							"<eos>": 1,
							"<bos>": 2,
							"<unk>": 3
						}
					}
				}`),
				"tokenizer_config.json": strings.NewReader(`{
					"add_bos_token": true,
					"add_eos_token": false,
					"bos_token": "<bos>",
					"eos_token": "<eos>",
					"pad_token": "<pad>",
					"unk_token": "<unk>"
				}`),
			}),
			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
			want: &Tokenizer{
				Vocabulary: &Vocabulary{
					Model:  "gpt2",
					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
					Scores: []float32{0, 1, 2, 3},
					Types:  []int32{3, 3, 3, 3},
				},
				SpecialVocabulary: []*SpecialVocabulary{
					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
				},
				Pre: "default",
			},
		},
	}

	for _, tt := range cases {
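Two details of the expected values are worth noting. The expected Scores simply mirror the added-token IDs (999, and 0 through 3). The expected Types appear to follow the llama.cpp token-type convention, under which an added token with "special": true becomes a control token and one with "special": false becomes a user-defined token; a hedged sketch of that assumed mapping, not part of this diff:

	// Assumed mapping, mirroring llama.cpp's llama_token_type enum;
	// reconstructed to explain the Types expectations above.
	const (
		tokenTypeNormal      int32 = 1
		tokenTypeUnknown     int32 = 2
		tokenTypeControl     int32 = 3 // "special": true  -> Types: []int32{3}
		tokenTypeUserDefined int32 = 4 // "special": false -> Types: []int32{4}
	)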
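Each case builds its fixture with createTokenizerFS, which is defined earlier in this file and so falls outside the hunk. Below is a minimal sketch of such a helper, reconstructed only from the call sites above; the signature and behavior are assumptions, not the repository's actual code:

	package convert

	import (
		"io"
		"io/fs"
		"os"
		"path/filepath"
		"testing"
	)

	// createTokenizerFS (sketch): writes each named reader into dir and
	// returns the directory as an fs.FS for the code under test to read.
	func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
		t.Helper()
		for name, r := range files {
			f, err := os.Create(filepath.Join(dir, name))
			if err != nil {
				t.Fatalf("create %s: %v", name, err)
			}
			if _, err := io.Copy(f, r); err != nil {
				f.Close()
				t.Fatalf("write %s: %v", name, err)
			}
			f.Close()
		}
		return os.DirFS(dir)
	}

Returning an fs.FS keeps the code under test decoupled from the on-disk layout, which is what makes these table-driven fixtures cheap to construct from inline JSON.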
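For reference, the new cases can be exercised with the standard Go test runner from the repository root:

	go test ./convert -run TestParseTokenizer -v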