Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
66ab4877
Commit
66ab4877
authored
May 29, 2024
by
Michael Yang
Browse files
proper utf16 support
parent
22fcf8f7
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
32 deletions
+58
-32
parser/parser.go
parser/parser.go
+58
-32
No files found.
parser/parser.go
View file @
66ab4877
...
@@ -3,12 +3,15 @@ package parser
...
@@ -3,12 +3,15 @@ package parser
import
(
import
(
"bufio"
"bufio"
"bytes"
"bytes"
"encoding/binary"
"errors"
"errors"
"fmt"
"fmt"
"io"
"io"
"log/slog"
"strconv"
"strconv"
"strings"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
)
type
File
struct
{
type
File
struct
{
...
@@ -69,31 +72,29 @@ func ParseFile(r io.Reader) (*File, error) {
...
@@ -69,31 +72,29 @@ func ParseFile(r io.Reader) (*File, error) {
var
b
bytes
.
Buffer
var
b
bytes
.
Buffer
var
role
string
var
role
string
var
lineCount
int
var
linePos
int
var
utf16
bool
var
f
File
var
f
File
br
:=
bufio
.
NewReader
(
r
)
br
:=
bufio
.
NewReader
(
r
)
for
{
r
,
_
,
err
:=
br
.
ReadRune
()
if
errors
.
Is
(
err
,
io
.
EOF
)
{
break
}
else
if
err
!=
nil
{
return
nil
,
err
}
// the utf16 byte order mark will be read as "unreadable" by ReadRune()
if
isUnreadable
(
r
)
&&
lineCount
==
0
&&
linePos
==
0
{
utf16
=
true
continue
}
// skip the second byte if we're reading utf16
var
sc
scannerDecoder
=
utf8ScannerDecoder
{}
if
utf16
&&
r
==
0
{
if
bom
,
err
:=
br
.
Peek
(
2
);
err
!=
nil
{
continue
slog
.
Warn
(
"error reading byte-order mark"
,
"error"
,
err
)
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFE
,
0xFF
})
{
sc
=
utf16ScannerDecoder
{
binary
.
LittleEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFF
,
0xFE
})
{
sc
=
utf16ScannerDecoder
{
binary
.
BigEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
scanner
:=
bufio
.
NewScanner
(
br
)
scanner
.
Split
(
sc
.
ScanBytes
)
for
scanner
.
Scan
()
{
r
,
err
:=
sc
.
DecodeRune
(
scanner
.
Bytes
())
if
err
!=
nil
{
return
nil
,
err
}
}
next
,
r
,
err
:=
parseRuneForState
(
r
,
curr
)
next
,
r
,
err
:=
parseRuneForState
(
r
,
curr
)
...
@@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) {
...
@@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) {
return
nil
,
err
return
nil
,
err
}
}
if
isNewline
(
r
)
{
lineCount
++
linePos
=
0
}
else
{
linePos
++
}
// process the state transition, some transitions need to be intercepted and redirected
// process the state transition, some transitions need to be intercepted and redirected
if
next
!=
curr
{
if
next
!=
curr
{
switch
curr
{
switch
curr
{
...
@@ -309,10 +303,6 @@ func isNewline(r rune) bool {
...
@@ -309,10 +303,6 @@ func isNewline(r rune) bool {
return
r
==
'\r'
||
r
==
'\n'
return
r
==
'\r'
||
r
==
'\n'
}
}
// isUnreadable reports whether r is unicode.ReplacementChar, the rune that
// stands in for input that could not be decoded.
func isUnreadable(r rune) bool {
	switch r {
	case unicode.ReplacementChar:
		return true
	default:
		return false
	}
}
// isValidMessageRole reports whether role is exactly one of the accepted
// message roles: "system", "user", or "assistant".
func isValidMessageRole(role string) bool {
	switch role {
	case "system", "user", "assistant":
		return true
	}
	return false
}
...
@@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool {
...
@@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool {
return
false
return
false
}
}
}
}
// scannerDecoder pairs a tokenizer with the decoder that understands its
// tokens. ScanBytes has the signature of a bufio.SplitFunc so it can be passed
// to Scanner.Split; DecodeRune turns one token produced by ScanBytes into a
// rune.
type scannerDecoder interface {
	// ScanBytes returns the number of bytes to consume, the next token, and
	// any error, following the bufio.SplitFunc contract.
	ScanBytes(data []byte, atEOF bool) (int, []byte, error)

	// DecodeRune converts a single token into the rune it encodes.
	DecodeRune(token []byte) (rune, error)
}
// utf8ScannerDecoder is the scannerDecoder used for UTF-8 encoded input.
type utf8ScannerDecoder struct{}

// ScanBytes is a bufio.SplitFunc that yields one complete UTF-8 encoded rune
// per token.
//
// Tokens must cover a whole rune because DecodeRune is handed exactly one
// token: splitting byte-by-byte would turn every byte of a multi-byte
// character into a separate replacement character. bufio.ScanRunes already
// implements this, including asking for more data when a multi-byte sequence
// is cut off by the end of the buffer, and emitting an encoded U+FFFD for
// bytes that are not valid UTF-8.
func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return bufio.ScanRunes(data, atEOF)
}

// DecodeRune decodes the first UTF-8 encoded rune in data. Empty or invalid
// input yields utf8.RuneError (U+FFFD); the returned error is always nil.
func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	r, _ := utf8.DecodeRune(data)
	return r, nil
}
// utf16ScannerDecoder is the scannerDecoder used for UTF-16 encoded input. The
// embedded binary.ByteOrder selects how each 16-bit code unit is read and must
// be non-nil.
//
// NOTE(review): per the Unicode standard the byte-order mark FE FF denotes
// big-endian and FF FE little-endian; confirm the call site that constructs
// this type picks the matching byte order.
type utf16ScannerDecoder struct {
	binary.ByteOrder
}

// ScanBytes is a bufio.SplitFunc that yields one UTF-16 code point per token:
// 2 bytes for a single code unit, or 4 bytes for a valid surrogate pair.
//
// When fewer bytes are buffered than are needed to decide, it returns
// (0, nil, nil) so the scanner reads more. A single dangling byte at EOF is
// reported as io.ErrUnexpectedEOF. An unpaired surrogate is emitted as a
// 2-byte token, which DecodeRune maps to U+FFFD.
func (e utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if len(data) < 2 {
		if atEOF && len(data) == 1 {
			return 0, nil, io.ErrUnexpectedEOF
		}
		return 0, nil, nil
	}

	const replacement = '\uFFFD'

	first := rune(e.ByteOrder.Uint16(data))
	if utf16.IsSurrogate(first) {
		if len(data) < 4 {
			if !atEOF {
				// The second half of a pair may not have been read yet.
				return 0, nil, nil
			}
		} else if utf16.DecodeRune(first, rune(e.ByteOrder.Uint16(data[2:]))) != replacement {
			// utf16.DecodeRune only returns U+FFFD for an invalid pair, so
			// anything else means the two units belong together.
			return 4, data[:4], nil
		}
	}
	return 2, data[:2], nil
}

// DecodeRune decodes one token produced by ScanBytes. A 2-byte token is a
// single code unit; a token of 4 or more bytes that starts with a surrogate is
// decoded as a surrogate pair. Unpaired or invalid surrogates decode to
// U+FFFD. Tokens shorter than 2 bytes return io.ErrUnexpectedEOF instead of
// panicking.
func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	if len(data) < 2 {
		return 0, io.ErrUnexpectedEOF
	}

	r := rune(e.ByteOrder.Uint16(data))
	if !utf16.IsSurrogate(r) {
		return r, nil
	}
	if len(data) >= 4 {
		return utf16.DecodeRune(r, rune(e.ByteOrder.Uint16(data[2:]))), nil
	}
	return '\uFFFD', nil
}
// scanBytesN is the shared body of the fixed-width bufio.SplitFunc
// implementations: it emits the next n bytes of data as a single token.
//
// It returns (0, nil, nil) when the input is exhausted, and also when fewer
// than n bytes are buffered before EOF so that the scanner reads more instead
// of slicing past the available data. A trailing fragment shorter than n bytes
// at EOF is reported as io.ErrUnexpectedEOF.
func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	if len(data) < n {
		if atEOF {
			return 0, nil, io.ErrUnexpectedEOF
		}
		// Not enough buffered yet; request more data.
		return 0, nil, nil
	}

	return n, data[:n], nil
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment