tokenizer.go 1.33 KB
Newer Older
Patrick Devine's avatar
Patrick Devine committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
package convert

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
)

// Tokenizer is the top-level JSON document that newTokenizer decodes.
type Tokenizer struct {
	// Version is read from the "version" key.
	Version     string         `json:"version"`
	// AddedTokens is read from the "added_tokens" key. newTokenizer copies
	// these entries into Model.Tokens at the index given by each token's ID.
	AddedTokens []Token        `json:"added_tokens"`
	// Model is read from the "model" key and holds the vocabulary and merges.
	Model       TokenizerModel `json:"model"`
}

// TokenizerModel is the value of the "model" key of a Tokenizer document.
type TokenizerModel struct {
	// Type is read from the "type" key.
	Type   string         `json:"type"`
	// Vocab is read from the "vocab" key and maps token content to token ID.
	Vocab  map[string]int `json:"vocab"`
	// Merges is read from the "merges" key.
	Merges []string       `json:"merges"`
	// Tokens has no json tag. newTokenizer replaces it after decoding with a
	// slice indexed by token ID, built from Vocab and Tokenizer.AddedTokens.
	Tokens []Token
}

// Token describes a single tokenizer entry.
type Token struct {
	// ID is read from the "id" key; newTokenizer uses it as the index of the
	// token inside TokenizerModel.Tokens.
	ID          int    `json:"id"`
	// Content is read from the "content" key.
	Content     string `json:"content"`
	// Special is read from the "special" key.
	Special     bool   `json:"special"`
	// UserDefined has no json tag. newTokenizer sets it to true on the copies
	// of added tokens it stores in TokenizerModel.Tokens and to false on
	// entries built from the vocabulary.
	UserDefined bool
}

// getMaxID returns the largest token ID found in either the added tokens or
// the model vocabulary. The search starts from zero, so the result is never
// negative and is zero when both collections are empty.
func (t *Tokenizer) getMaxID() int {
	highest := 0

	// The two scans are independent; each only ever raises the running maximum.
	for i := range t.AddedTokens {
		highest = max(highest, t.AddedTokens[i].ID)
	}
	for _, id := range t.Model.Vocab {
		highest = max(highest, id)
	}

	return highest
}

// newTokenizer reads the JSON tokenizer definition stored in the file at
// dirpath and returns it with Model.Tokens filled in. Despite its name,
// dirpath is opened directly and its contents are parsed as JSON.
//
// Model.Tokens has one slot per ID from zero up to the largest ID found in
// the vocabulary or the added tokens, and is indexed by token ID. Vocabulary
// entries are stored first; added tokens are stored afterwards with
// UserDefined set to true, so an added token replaces a vocabulary entry that
// shares its ID. IDs that no entry claims are left as zero-value Tokens.
//
// A nil Tokenizer and a non-nil error are returned when the file cannot be
// opened or read, when its contents cannot be decoded into a Tokenizer, or
// when any token ID is negative.
func newTokenizer(dirpath string) (*Tokenizer, error) {
	f, err := os.Open(dirpath)
	if err != nil {
		// Hand the failure back to the caller rather than panicking.
		return nil, err
	}
	defer f.Close()

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return nil, err
	}

	var tdata Tokenizer
	if err := json.Unmarshal(data, &tdata); err != nil {
		return nil, err
	}

	// getMaxID never returns a negative value, so the slice always has at
	// least one slot.
	tdata.Model.Tokens = make([]Token, tdata.getMaxID()+1)

	for content, id := range tdata.Model.Vocab {
		// IDs come from the file and are used as slice indexes; a negative
		// value would otherwise trigger an index-out-of-range panic.
		if id < 0 {
			return nil, fmt.Errorf("tokenizer %s: vocab entry %q has negative id %d", dirpath, content, id)
		}
		tdata.Model.Tokens[id] = Token{ID: id, Content: content}
	}

	// tok is a copy, so the flag is set only on the entry stored in
	// Model.Tokens; the elements of AddedTokens are left unchanged.
	for _, tok := range tdata.AddedTokens {
		if tok.ID < 0 {
			return nil, fmt.Errorf("tokenizer %s: added token %q has negative id %d", dirpath, tok.Content, tok.ID)
		}
		tok.UserDefined = true
		tdata.Model.Tokens[tok.ID] = tok
	}

	return &tdata, nil
}