Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
0f3cf1d4
Unverified
Commit
0f3cf1d4
authored
Jun 10, 2024
by
Michael Yang
Committed by
GitHub
Jun 10, 2024
Browse files
Merge pull request #4715 from ollama/mxyng/utf16-parser
proper utf16 support
parents
5bc029c5
66ab4877
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
32 deletions
+58
-32
parser/parser.go
parser/parser.go
+58
-32
No files found.
parser/parser.go
View file @
0f3cf1d4
...
@@ -3,12 +3,15 @@ package parser
...
@@ -3,12 +3,15 @@ package parser
import
(
import
(
"bufio"
"bufio"
"bytes"
"bytes"
"encoding/binary"
"errors"
"errors"
"fmt"
"fmt"
"io"
"io"
"log/slog"
"strconv"
"strconv"
"strings"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
)
type
File
struct
{
type
File
struct
{
...
@@ -69,31 +72,29 @@ func ParseFile(r io.Reader) (*File, error) {
...
@@ -69,31 +72,29 @@ func ParseFile(r io.Reader) (*File, error) {
var
b
bytes
.
Buffer
var
b
bytes
.
Buffer
var
role
string
var
role
string
var
lineCount
int
var
linePos
int
var
utf16
bool
var
f
File
var
f
File
br
:=
bufio
.
NewReader
(
r
)
br
:=
bufio
.
NewReader
(
r
)
for
{
r
,
_
,
err
:=
br
.
ReadRune
()
if
errors
.
Is
(
err
,
io
.
EOF
)
{
break
}
else
if
err
!=
nil
{
return
nil
,
err
}
// the utf16 byte order mark will be read as "unreadable" by ReadRune()
var
sc
scannerDecoder
=
utf8ScannerDecoder
{}
if
isUnreadable
(
r
)
&&
lineCount
==
0
&&
linePos
==
0
{
if
bom
,
err
:=
br
.
Peek
(
2
);
err
!=
nil
{
utf16
=
true
slog
.
Warn
(
"error reading byte-order mark"
,
"error"
,
err
)
continue
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFE
,
0xFF
})
{
}
sc
=
utf16ScannerDecoder
{
binary
.
LittleEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFF
,
0xFE
})
{
sc
=
utf16ScannerDecoder
{
binary
.
BigEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
// skip the second byte if we're reading utf16
scanner
:=
bufio
.
NewScanner
(
br
)
if
utf16
&&
r
==
0
{
scanner
.
Split
(
sc
.
ScanBytes
)
continue
for
scanner
.
Scan
()
{
r
,
err
:=
sc
.
DecodeRune
(
scanner
.
Bytes
())
if
err
!=
nil
{
return
nil
,
err
}
}
next
,
r
,
err
:=
parseRuneForState
(
r
,
curr
)
next
,
r
,
err
:=
parseRuneForState
(
r
,
curr
)
...
@@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) {
...
@@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) {
return
nil
,
err
return
nil
,
err
}
}
if
isNewline
(
r
)
{
lineCount
++
linePos
=
0
}
else
{
linePos
++
}
// process the state transition, some transitions need to be intercepted and redirected
// process the state transition, some transitions need to be intercepted and redirected
if
next
!=
curr
{
if
next
!=
curr
{
switch
curr
{
switch
curr
{
...
@@ -309,10 +303,6 @@ func isNewline(r rune) bool {
...
@@ -309,10 +303,6 @@ func isNewline(r rune) bool {
return
r
==
'\r'
||
r
==
'\n'
return
r
==
'\r'
||
r
==
'\n'
}
}
// isUnreadable reports whether r is unicode.ReplacementChar (U+FFFD).
// ParseFile uses it on the very first rune of the input: per the comment at
// the call site, a UTF-16 byte-order mark is read as "unreadable" by
// ReadRune, which is how UTF-16 input is detected.
func isUnreadable(r rune) bool {
	switch r {
	case unicode.ReplacementChar:
		return true
	default:
		return false
	}
}
// isValidMessageRole reports whether role is exactly one of the accepted
// message roles: "system", "user", or "assistant". The comparison is
// case-sensitive and no trimming is applied.
func isValidMessageRole(role string) bool {
	switch role {
	case "system", "user", "assistant":
		return true
	}
	return false
}
...
@@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool {
...
@@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool {
return
false
return
false
}
}
}
}
// scannerDecoder couples a tokenizer with the decoder for the tokens it
// produces, so that ParseFile can read its input one rune at a time
// regardless of the text encoding.
type scannerDecoder interface {
	// ScanBytes has the signature of a bufio.SplitFunc; ParseFile installs
	// it on a bufio.Scanner with Split. It returns the number of bytes to
	// advance, the next token, and any error.
	ScanBytes(data []byte, atEOF bool) (int, []byte, error)

	// DecodeRune converts a single token produced by ScanBytes into a rune.
	DecodeRune(token []byte) (rune, error)
}
type
utf8ScannerDecoder
struct
{}
func
(
utf8ScannerDecoder
)
ScanBytes
(
data
[]
byte
,
atEOF
bool
)
(
advance
int
,
token
[]
byte
,
err
error
)
{
return
scanBytesN
(
data
,
1
,
atEOF
)
}
func
(
utf8ScannerDecoder
)
DecodeRune
(
data
[]
byte
)
(
rune
,
error
)
{
r
,
_
:=
utf8
.
DecodeRune
(
data
)
return
r
,
nil
}
type
utf16ScannerDecoder
struct
{
binary
.
ByteOrder
}
func
(
utf16ScannerDecoder
)
ScanBytes
(
data
[]
byte
,
atEOF
bool
)
(
advance
int
,
token
[]
byte
,
err
error
)
{
return
scanBytesN
(
data
,
2
,
atEOF
)
}
func
(
e
utf16ScannerDecoder
)
DecodeRune
(
data
[]
byte
)
(
rune
,
error
)
{
return
utf16
.
Decode
([]
uint16
{
e
.
ByteOrder
.
Uint16
(
data
)})[
0
],
nil
}
// scanBytesN is the shared implementation behind the ScanBytes split
// functions: it yields the next n bytes of data as a single token.
//
// n is expected to be positive. Results:
//   - (0, nil, nil) when data is empty at EOF, which ends the scan;
//   - (0, nil, nil) when fewer than n bytes are buffered and more input may
//     arrive, which asks the scanner to read more;
//   - (0, nil, io.ErrUnexpectedEOF) when the input ends with fewer than n
//     bytes left, i.e. the final unit is truncated;
//   - (n, data[:n], nil) otherwise.
func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	if len(data) < n {
		if !atEOF {
			// Slicing data[:n] here would panic; wait for the rest instead.
			return 0, nil, nil
		}
		return 0, nil, io.ErrUnexpectedEOF
	}

	return n, data[:n], nil
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment