parser.go 7.11 KB
Newer Older
1
package parser
2
3
4

import (
	"bufio"
5
	"bytes"
Michael Yang's avatar
Michael Yang committed
6
	"encoding/binary"
7
	"errors"
8
	"fmt"
9
	"io"
Michael Yang's avatar
Michael Yang committed
10
	"log/slog"
Michael Yang's avatar
Michael Yang committed
11
12
	"strconv"
	"strings"
Michael Yang's avatar
Michael Yang committed
13
14
	"unicode/utf16"
	"unicode/utf8"
15
16
)

Michael Yang's avatar
Michael Yang committed
17
18
19
20
21
22
23
24
25
26
27
28
29
// File is the parsed form of a command file: the list of commands found in
// the input, in the order ParseFile appended them.
type File struct {
	// Commands holds one entry per command that was parsed.
	Commands []Command
}

// String renders the file back to text: each command's String form on its own
// line, every line (including the last) terminated by a newline.
func (f File) String() string {
	var out strings.Builder
	for i := range f.Commands {
		out.WriteString(f.Commands[i].String())
		out.WriteByte('\n')
	}

	return out.String()
}

30
31
// Command is a single parsed instruction.
type Command struct {
	// Name is the normalized command name. ParseFile stores "model" for FROM,
	// the lower-cased keyword for the other built-in commands, and the
	// parameter name itself for PARAMETER lines.
	Name string

	// Args is the unquoted argument text. For "message" commands ParseFile
	// stores it as "<role>: <content>".
	Args string
}

Michael Yang's avatar
Michael Yang committed
35
func (c Command) String() string {
Michael Yang's avatar
Michael Yang committed
36
	var sb strings.Builder
Michael Yang's avatar
Michael Yang committed
37
38
	switch c.Name {
	case "model":
Michael Yang's avatar
Michael Yang committed
39
		fmt.Fprintf(&sb, "FROM %s", c.Args)
Michael Yang's avatar
Michael Yang committed
40
	case "license", "template", "system", "adapter":
Michael Yang's avatar
Michael Yang committed
41
		fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
Michael Yang's avatar
Michael Yang committed
42
43
	case "message":
		role, message, _ := strings.Cut(c.Args, ": ")
Michael Yang's avatar
Michael Yang committed
44
		fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
Michael Yang's avatar
Michael Yang committed
45
	default:
Michael Yang's avatar
Michael Yang committed
46
		fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
Michael Yang's avatar
Michael Yang committed
47
48
	}

Michael Yang's avatar
Michael Yang committed
49
	return sb.String()
Michael Yang's avatar
Michael Yang committed
50
51
}

Michael Yang's avatar
Michael Yang committed
52
// state identifies which part of a command the parser is currently reading.
type state int
53

Michael Yang's avatar
Michael Yang committed
54
55
56
57
58
59
60
61
// Parser states. stateNil is the zero value and therefore the initial state
// of a freshly declared state variable.
const (
	// stateNil: between commands; whitespace and newlines are skipped.
	stateNil state = iota
	// stateName: reading the command keyword.
	stateName
	// stateValue: reading a command's argument text.
	stateValue
	// stateParameter: reading the parameter name that follows PARAMETER.
	stateParameter
	// stateMessage: reading the role that follows MESSAGE.
	stateMessage
	// stateComment: inside a '#' comment; everything up to a newline is dropped.
	stateComment
)
62

Michael Yang's avatar
tests  
Michael Yang committed
63
// Sentinel errors returned by the parser.
var (
	// errMissingFrom is returned by ParseFile when no FROM command was parsed.
	errMissingFrom = errors.New("no FROM line")

	// errInvalidMessageRole is returned when a MESSAGE role is not one of the
	// accepted roles.
	errInvalidMessageRole = errors.New(`message role must be one of "system", "user", or "assistant"`)

	// errInvalidCommand is returned when a line starts with an unknown keyword.
	errInvalidCommand = errors.New(`command must be one of "from", "license", "template", "system", "adapter", "parameter", or "message"`)
)
Michael Yang's avatar
Michael Yang committed
68

Michael Yang's avatar
Michael Yang committed
69
func ParseFile(r io.Reader) (*File, error) {
Michael Yang's avatar
Michael Yang committed
70
71
72
73
74
	var cmd Command
	var curr state
	var b bytes.Buffer
	var role string

Michael Yang's avatar
Michael Yang committed
75
76
	var f File

Michael Yang's avatar
Michael Yang committed
77
	br := bufio.NewReader(r)
78

Michael Yang's avatar
Michael Yang committed
79
80
81
82
83
84
85
86
87
88
89
90
	var sc scannerDecoder = utf8ScannerDecoder{}
	if bom, err := br.Peek(2); err != nil {
		slog.Warn("error reading byte-order mark", "error", err)
	} else if bytes.Equal(bom, []byte{0xFE, 0xFF}) {
		sc = utf16ScannerDecoder{binary.LittleEndian}
		//nolint:errcheck
		br.Discard(2)
	} else if bytes.Equal(bom, []byte{0xFF, 0xFE}) {
		sc = utf16ScannerDecoder{binary.BigEndian}
		//nolint:errcheck
		br.Discard(2)
	}
91

Michael Yang's avatar
Michael Yang committed
92
93
94
95
96
97
	scanner := bufio.NewScanner(br)
	scanner.Split(sc.ScanBytes)
	for scanner.Scan() {
		r, err := sc.DecodeRune(scanner.Bytes())
		if err != nil {
			return nil, err
98
99
		}

Michael Yang's avatar
Michael Yang committed
100
101
102
103
104
		next, r, err := parseRuneForState(r, curr)
		if errors.Is(err, io.ErrUnexpectedEOF) {
			return nil, fmt.Errorf("%w: %s", err, b.String())
		} else if err != nil {
			return nil, err
105
106
		}

Michael Yang's avatar
Michael Yang committed
107
		// process the state transition, some transitions need to be intercepted and redirected
Michael Yang's avatar
Michael Yang committed
108
109
		if next != curr {
			switch curr {
Michael Yang's avatar
Michael Yang committed
110
111
112
113
114
			case stateName:
				if !isValidCommand(b.String()) {
					return nil, errInvalidCommand
				}

Michael Yang's avatar
Michael Yang committed
115
				// next state sometimes depends on the current buffer value
Michael Yang's avatar
Michael Yang committed
116
117
118
119
				switch s := strings.ToLower(b.String()); s {
				case "from":
					cmd.Name = "model"
				case "parameter":
Michael Yang's avatar
Michael Yang committed
120
					// transition to stateParameter which sets command name
Michael Yang's avatar
Michael Yang committed
121
122
					next = stateParameter
				case "message":
Michael Yang's avatar
Michael Yang committed
123
					// transition to stateMessage which validates the message role
Michael Yang's avatar
Michael Yang committed
124
125
126
127
128
					next = stateMessage
					fallthrough
				default:
					cmd.Name = s
				}
Michael Yang's avatar
Michael Yang committed
129
130
			case stateParameter:
				cmd.Name = b.String()
Michael Yang's avatar
Michael Yang committed
131
			case stateMessage:
Michael Yang's avatar
Michael Yang committed
132
				if !isValidMessageRole(b.String()) {
Michael Yang's avatar
Michael Yang committed
133
					return nil, errInvalidMessageRole
Michael Yang's avatar
Michael Yang committed
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
				}

				role = b.String()
			case stateComment, stateNil:
				// pass
			case stateValue:
				s, ok := unquote(b.String())
				if !ok || isSpace(r) {
					if _, err := b.WriteRune(r); err != nil {
						return nil, err
					}

					continue
				}

				if role != "" {
					s = role + ": " + s
					role = ""
				}

				cmd.Args = s
Michael Yang's avatar
Michael Yang committed
155
				f.Commands = append(f.Commands, cmd)
Michael Yang's avatar
Michael Yang committed
156
157
			}

Michael Yang's avatar
Michael Yang committed
158
159
160
161
162
163
164
			b.Reset()
			curr = next
		}

		if strconv.IsPrint(r) {
			if _, err := b.WriteRune(r); err != nil {
				return nil, err
Michael Yang's avatar
Michael Yang committed
165
			}
Michael Yang's avatar
Michael Yang committed
166
167
168
169
170
171
172
173
		}
	}

	// flush the buffer
	switch curr {
	case stateComment, stateNil:
		// pass; nothing to flush
	case stateValue:
Michael Yang's avatar
Michael Yang committed
174
175
		s, ok := unquote(b.String())
		if !ok {
Michael Yang's avatar
Michael Yang committed
176
			return nil, io.ErrUnexpectedEOF
177
		}
178

Michael Yang's avatar
Michael Yang committed
179
180
181
182
183
		if role != "" {
			s = role + ": " + s
		}

		cmd.Args = s
Michael Yang's avatar
Michael Yang committed
184
		f.Commands = append(f.Commands, cmd)
Michael Yang's avatar
Michael Yang committed
185
186
	default:
		return nil, io.ErrUnexpectedEOF
187
188
	}

Michael Yang's avatar
Michael Yang committed
189
	for _, cmd := range f.Commands {
Michael Yang's avatar
Michael Yang committed
190
		if cmd.Name == "model" {
Michael Yang's avatar
Michael Yang committed
191
			return &f, nil
Michael Yang's avatar
Michael Yang committed
192
		}
193
194
	}

Michael Yang's avatar
tests  
Michael Yang committed
195
	return nil, errMissingFrom
196
}
197

Michael Yang's avatar
Michael Yang committed
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
func parseRuneForState(r rune, cs state) (state, rune, error) {
	switch cs {
	case stateNil:
		switch {
		case r == '#':
			return stateComment, 0, nil
		case isSpace(r), isNewline(r):
			return stateNil, 0, nil
		default:
			return stateName, r, nil
		}
	case stateName:
		switch {
		case isAlpha(r):
			return stateName, r, nil
		case isSpace(r):
			return stateValue, 0, nil
		default:
Michael Yang's avatar
Michael Yang committed
216
			return stateNil, 0, errInvalidCommand
Michael Yang's avatar
Michael Yang committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
		}
	case stateValue:
		switch {
		case isNewline(r):
			return stateNil, r, nil
		case isSpace(r):
			return stateNil, r, nil
		default:
			return stateValue, r, nil
		}
	case stateParameter:
		switch {
		case isAlpha(r), isNumber(r), r == '_':
			return stateParameter, r, nil
		case isSpace(r):
			return stateValue, 0, nil
		default:
			return stateNil, 0, io.ErrUnexpectedEOF
		}
	case stateMessage:
		switch {
		case isAlpha(r):
			return stateMessage, r, nil
		case isSpace(r):
			return stateValue, 0, nil
		default:
			return stateNil, 0, io.ErrUnexpectedEOF
		}
	case stateComment:
		switch {
		case isNewline(r):
			return stateNil, 0, nil
		default:
			return stateComment, 0, nil
		}
	default:
		return stateNil, 0, errors.New("")
254
	}
Michael Yang's avatar
Michael Yang committed
255
}
256

Michael Yang's avatar
Michael Yang committed
257
// quote wraps s in quotes when it could not be written bare: that is, when it
// contains a newline or starts or ends with a space. Triple double quotes are
// used if s itself contains a double quote, single double quotes otherwise.
// Any other string is returned unchanged.
func quote(s string) string {
	needsQuotes := strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ")
	if !needsQuotes {
		return s
	}

	if strings.Contains(s, "\"") {
		return `"""` + s + `"""`
	}

	return `"` + s + `"`
}

Michael Yang's avatar
Michael Yang committed
269
270
271
272
273
274
275
276
// unquote strips one layer of surrounding quotes from s.
//
// A string that opens with `"""` must also close with `"""`, and a string
// that opens with `"` must also close with `"`; if the closing delimiter is
// missing, unquote returns ("", false). A string that does not start with a
// double quote is returned unchanged with true.
func unquote(s string) (string, bool) {
	// TODO: single quotes
	if len(s) >= 3 && s[:3] == `"""` {
		// len >= 6 ensures the opening and closing delimiters do not overlap.
		if len(s) >= 6 && s[len(s)-3:] == `"""` {
			return s[3 : len(s)-3], true
		}

		return "", false
	}

	if len(s) >= 1 && s[0] == '"' {
		// len >= 2 ensures a lone `"` is not treated as both delimiters.
		if len(s) >= 2 && s[len(s)-1] == '"' {
			return s[1 : len(s)-1], true
		}

		return "", false
	}

	return s, true
}

Michael Yang's avatar
Michael Yang committed
290
291
292
// isAlpha reports whether r is an ASCII letter, upper or lower case.
func isAlpha(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	}

	return false
}
293

Michael Yang's avatar
Michael Yang committed
294
295
296
// isNumber reports whether r is an ASCII decimal digit.
func isNumber(r rune) bool {
	if r < '0' {
		return false
	}

	return r <= '9'
}
297

Michael Yang's avatar
Michael Yang committed
298
299
300
// isSpace reports whether r is a space or a horizontal tab.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}

	return false
}
Michael Yang's avatar
Michael Yang committed
301

Michael Yang's avatar
Michael Yang committed
302
303
304
// isNewline reports whether r is a carriage return or a line feed.
func isNewline(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	}

	return false
}
305

Michael Yang's avatar
Michael Yang committed
306
// isValidMessageRole reports whether role is exactly "system", "user" or
// "assistant". The comparison is case-sensitive.
func isValidMessageRole(role string) bool {
	return role == "system" || role == "user" || role == "assistant"
}
Michael Yang's avatar
Michael Yang committed
309
310
311
312
313
314
315
316
317

// isValidCommand reports whether cmd, compared after lower-casing, is one of
// the recognized command keywords.
func isValidCommand(cmd string) bool {
	known := [...]string{"from", "license", "template", "system", "adapter", "parameter", "message"}

	name := strings.ToLower(cmd)
	for _, k := range known {
		if name == k {
			return true
		}
	}

	return false
}
Michael Yang's avatar
Michael Yang committed
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353

// scannerDecoder pairs a tokenizer with the matching rune decoder for one
// text encoding.
type scannerDecoder interface {
	// ScanBytes has the shape of a bufio split function; ParseFile installs it
	// with Scanner.Split so that each token carries the bytes of one rune.
	ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)
	// DecodeRune converts one token produced by ScanBytes into a rune.
	DecodeRune([]byte) (rune, error)
}

// utf8ScannerDecoder is the scannerDecoder for UTF-8 input.
type utf8ScannerDecoder struct{}

// ScanBytes splits the input into tokens of one complete UTF-8 encoded rune
// each, so multi-byte characters stay intact. It delegates to bufio.ScanRunes,
// which requests more data when a rune is split across reads and yields the
// encoding of U+FFFD for invalid bytes.
func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return bufio.ScanRunes(data, atEOF)
}

// DecodeRune decodes the first UTF-8 rune in data. It never returns an error;
// empty or invalid input decodes to utf8.RuneError.
func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	r, _ := utf8.DecodeRune(data)
	return r, nil
}

// utf16ScannerDecoder is the scannerDecoder for UTF-16 input. The embedded
// ByteOrder selects how each 16-bit code unit is read from the byte stream.
type utf16ScannerDecoder struct {
	binary.ByteOrder
}

// ScanBytes splits the input into tokens of one UTF-16 encoded rune each:
// four bytes for a well-formed surrogate pair, two bytes for anything else
// (including an unpaired surrogate). When a high surrogate is the last
// complete unit available and more input may follow, it requests more data so
// the pair is not torn apart.
func (e utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	const (
		highSurrogateStart = 0xD800
		lowSurrogateStart  = 0xDC00
		surrogateEnd       = 0xE000
	)

	if len(data) >= 2 {
		if hi := e.ByteOrder.Uint16(data); hi >= highSurrogateStart && hi < lowSurrogateStart {
			switch {
			case len(data) >= 4:
				if lo := e.ByteOrder.Uint16(data[2:]); lo >= lowSurrogateStart && lo < surrogateEnd {
					return scanBytesN(data, 4, atEOF)
				}
			case !atEOF:
				// The low surrogate has not been read yet.
				return 0, nil, nil
			}
		}
	}

	return scanBytesN(data, 2, atEOF)
}

// DecodeRune decodes the first rune from a token of one or two UTF-16 code
// units. An unpaired surrogate decodes to U+FFFD. A token that is empty or
// has an odd length is reported as io.ErrUnexpectedEOF.
func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	if len(data) < 2 || len(data)%2 != 0 {
		return utf8.RuneError, io.ErrUnexpectedEOF
	}

	units := make([]uint16, len(data)/2)
	for i := range units {
		units[i] = e.ByteOrder.Uint16(data[2*i:])
	}

	return utf16.Decode(units)[0], nil
}

// scanBytesN is a fixed-width split helper: it returns the first n bytes of
// data as the token and advances by n.
//
// When fewer than n bytes are available it never slices past the data:
//   - before EOF it returns (0, nil, nil) so the scanner reads more input;
//   - at EOF with no data left it returns (0, nil, nil) to end the scan;
//   - at EOF with a partial token left it returns io.ErrUnexpectedEOF.
func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
	switch {
	case len(data) >= n:
		return n, data[:n], nil
	case !atEOF, len(data) == 0:
		return 0, nil, nil
	default:
		return 0, nil, io.ErrUnexpectedEOF
	}
}