tokenizer.go 2.32 KB
Newer Older
Patrick Devine's avatar
Patrick Devine committed
1
2
3
package convert

import (
Michael Yang's avatar
Michael Yang committed
4
5
	"cmp"
	"crypto/sha256"
Patrick Devine's avatar
Patrick Devine committed
6
	"encoding/json"
Michael Yang's avatar
Michael Yang committed
7
8
	"fmt"
	"log/slog"
Patrick Devine's avatar
Patrick Devine committed
9
	"os"
Michael Yang's avatar
Michael Yang committed
10
11
12
	"slices"

	"golang.org/x/exp/maps"
Patrick Devine's avatar
Patrick Devine committed
13
14
15
16
17
18
)

type Tokenizer struct {
	Version     string         `json:"version"`
	AddedTokens []Token        `json:"added_tokens"`
	Model       TokenizerModel `json:"model"`
Michael Yang's avatar
Michael Yang committed
19
20

	PreTokenizer struct {
21
		PreTokenizers []struct {
Michael Yang's avatar
Michael Yang committed
22
23
24
25
26
27
			Type    string `json:"type"`
			Pattern struct {
				Regex string `json:"Regex"`
			} `json:"pattern"`
		} `json:"pretokenizers"`
	} `json:"pre_tokenizer"`
Patrick Devine's avatar
Patrick Devine committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
}

// TokenizerModel is the "model" section of a tokenizer JSON document.
type TokenizerModel struct {
	// Type is decoded from the "type" key.
	Type   string         `json:"type"`
	// Vocab maps token content to token ID; parseTokens uses the ID as the
	// index of the token in its result slice.
	Vocab  map[string]int `json:"vocab"`
	// Merges is returned unchanged by parseTokens as its merges result.
	Merges []string       `json:"merges"`
	// Tokens has no JSON tag and is not written by the code in this file.
	// NOTE(review): confirm whether any caller still relies on it.
	Tokens []Token
}

// Token is a single vocabulary entry.
type Token struct {
	// ID is the token's numeric ID; parseTokens stores each token at
	// tokens[ID].
	ID          int    `json:"id"`
	// Content is the token text.
	Content     string `json:"content"`
	// Special makes Type report tokenTypeControl.
	Special     bool   `json:"special"`
	// UserDefined has no JSON tag; parseTokens sets it to true for every
	// entry taken from Tokenizer.AddedTokens. Type reports
	// tokenTypeUserDefined for such tokens unless Special is also set.
	UserDefined bool
}

Michael Yang's avatar
Michael Yang committed
44
45
46
func (t *Token) Type() int32 {
	switch {
	case t.Special:
Michael Yang's avatar
cleanup  
Michael Yang committed
47
		return tokenTypeControl
Michael Yang's avatar
Michael Yang committed
48
	case t.UserDefined:
Michael Yang's avatar
cleanup  
Michael Yang committed
49
		return tokenTypeUserDefined
Michael Yang's avatar
Michael Yang committed
50
	default:
Michael Yang's avatar
cleanup  
Michael Yang committed
51
		return tokenTypeNormal
Patrick Devine's avatar
Patrick Devine committed
52
	}
Michael Yang's avatar
Michael Yang committed
53
}
Patrick Devine's avatar
Patrick Devine committed
54

Michael Yang's avatar
Michael Yang committed
55
56
57
58
59
60
61
func (t *Tokenizer) maxID() int {
	return max(
		slices.Max(maps.Values(t.Model.Vocab)),
		slices.MaxFunc(t.AddedTokens, func(a, b Token) int {
			return cmp.Compare(a.ID, b.ID)
		}).ID,
	)
Patrick Devine's avatar
Patrick Devine committed
62
63
}

Michael Yang's avatar
Michael Yang committed
64
func parseTokens(dirpath string) (pre string, tokens []Token, merges []string, err error) {
Patrick Devine's avatar
Patrick Devine committed
65
66
67
68
69
70
	f, err := os.Open(dirpath)
	if err != nil {
		panic(err)
	}
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
71
72
73
	var t Tokenizer
	if err := json.NewDecoder(f).Decode(&t); err != nil {
		return "", nil, nil, err
Patrick Devine's avatar
Patrick Devine committed
74
75
	}

Michael Yang's avatar
Michael Yang committed
76
77
78
	tokens = make([]Token, t.maxID()+1)
	for k, v := range t.Model.Vocab {
		tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false}
Patrick Devine's avatar
Patrick Devine committed
79
80
	}

Michael Yang's avatar
Michael Yang committed
81
82
83
84
	for _, v := range t.AddedTokens {
		v.UserDefined = true
		tokens[v.ID] = v
	}
Patrick Devine's avatar
Patrick Devine committed
85

Michael Yang's avatar
Michael Yang committed
86
	sha256sum := sha256.New()
87
	for _, pt := range t.PreTokenizer.PreTokenizers {
Michael Yang's avatar
Michael Yang committed
88
89
		if pt.Type == "Split" && pt.Pattern.Regex != "" {
			sha256sum.Write([]byte(pt.Pattern.Regex))
Michael Yang's avatar
Michael Yang committed
90
		}
Patrick Devine's avatar
Patrick Devine committed
91
92
	}

Michael Yang's avatar
Michael Yang committed
93
94
95
96
97
98
99
100
101
102
	switch digest := fmt.Sprintf("%x", sha256sum.Sum(nil)); digest {
	case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
		pre = "llama-bpe"
	case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
		pre = "deepseek-llm"
	case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
		pre = "deepseek-coder"
	default:
		slog.Warn("unknown pretokenizer, using default", "digest", digest)
		pre = "default"
Patrick Devine's avatar
Patrick Devine committed
103
104
	}

Michael Yang's avatar
Michael Yang committed
105
	return pre, tokens, t.Model.Merges, nil
Patrick Devine's avatar
Patrick Devine committed
106
}