Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
orangecat
ollama
Commits
e87fc720
Unverified
Commit
e87fc720
authored
Jun 13, 2024
by
Michael Yang
Committed by
GitHub
Jun 13, 2024
Browse files
Merge pull request #5025 from ollama/mxyng/revert-parser-scan
Revert "proper utf16 support"
parents
c69bc19e
20b9f8e6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
58 deletions
+32
-58
parser/parser.go
parser/parser.go
+32
-58
No files found.
parser/parser.go
View file @
e87fc720
...
...
@@ -3,15 +3,12 @@ package parser
import
(
"bufio"
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"log/slog"
"strconv"
"strings"
"unicode/utf16"
"unicode/utf8"
"unicode"
)
type
File
struct
{
...
...
@@ -72,29 +69,31 @@ func ParseFile(r io.Reader) (*File, error) {
var
b
bytes
.
Buffer
var
role
string
var
lineCount
int
var
linePos
int
var
utf16
bool
var
f
File
br
:=
bufio
.
NewReader
(
r
)
for
{
r
,
_
,
err
:=
br
.
ReadRune
()
if
errors
.
Is
(
err
,
io
.
EOF
)
{
break
}
else
if
err
!=
nil
{
return
nil
,
err
}
var
sc
scannerDecoder
=
utf8ScannerDecoder
{}
if
bom
,
err
:=
br
.
Peek
(
2
);
err
!=
nil
{
slog
.
Warn
(
"error reading byte-order mark"
,
"error"
,
err
)
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFE
,
0xFF
})
{
sc
=
utf16ScannerDecoder
{
binary
.
LittleEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
else
if
bytes
.
Equal
(
bom
,
[]
byte
{
0xFF
,
0xFE
})
{
sc
=
utf16ScannerDecoder
{
binary
.
BigEndian
}
//nolint:errcheck
br
.
Discard
(
2
)
}
// the utf16 byte order mark will be read as "unreadable" by ReadRune()
if
isUnreadable
(
r
)
&&
lineCount
==
0
&&
linePos
==
0
{
utf16
=
true
continue
}
scanner
:=
bufio
.
NewScanner
(
br
)
scanner
.
Split
(
sc
.
ScanBytes
)
for
scanner
.
Scan
()
{
r
,
err
:=
sc
.
DecodeRune
(
scanner
.
Bytes
())
if
err
!=
nil
{
return
nil
,
err
// skip the second byte if we're reading utf16
if
utf16
&&
r
==
0
{
continue
}
next
,
r
,
err
:=
parseRuneForState
(
r
,
curr
)
...
...
@@ -104,6 +103,13 @@ func ParseFile(r io.Reader) (*File, error) {
return
nil
,
err
}
if
isNewline
(
r
)
{
lineCount
++
linePos
=
0
}
else
{
linePos
++
}
// process the state transition, some transitions need to be intercepted and redirected
if
next
!=
curr
{
switch
curr
{
...
...
@@ -303,6 +309,10 @@ func isNewline(r rune) bool {
return
r
==
'\r'
||
r
==
'\n'
}
// isUnreadable reports whether r is the Unicode replacement character
// (unicode.ReplacementChar, U+FFFD).
//
// ParseFile checks the very first rune of the input with it: per the comment
// there, a UTF-16 byte-order mark is read as "unreadable" by ReadRune.
func isUnreadable(r rune) bool {
	return r == unicode.ReplacementChar
}
// isValidMessageRole reports whether role is exactly one of the accepted
// message roles: "system", "user" or "assistant". The comparison is
// case-sensitive and does not trim whitespace.
func isValidMessageRole(role string) bool {
	switch role {
	case "system", "user", "assistant":
		return true
	default:
		return false
	}
}
...
...
@@ -315,39 +325,3 @@ func isValidCommand(cmd string) bool {
return
false
}
}
// scannerDecoder pairs a tokenizer with the matching decoder for one text
// encoding.
type scannerDecoder interface {
	// ScanBytes has the shape of a bufio.SplitFunc; ParseFile installs it with
	// Scanner.Split so the scanner hands back one encoded unit per token.
	ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)

	// DecodeRune converts a token produced by ScanBytes into a rune.
	DecodeRune([]byte) (rune, error)
}
// utf8ScannerDecoder is the scannerDecoder for UTF-8 encoded input.
type utf8ScannerDecoder struct{}

// ScanBytes is a bufio.SplitFunc that yields one complete UTF-8 encoded rune
// per token.
//
// Splitting on single bytes would hand DecodeRune the individual bytes of a
// multi-byte character, each of which decodes to U+FFFD. Delegating to
// bufio.ScanRunes keeps a character's bytes together, asks the scanner for
// more input when a sequence is still incomplete, and turns an invalid byte
// into the encoding of U+FFFD.
func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return bufio.ScanRunes(data, atEOF)
}

// DecodeRune decodes the first UTF-8 sequence in data. The error is always
// nil: empty or malformed input is reported in-band as utf8.RuneError
// (U+FFFD).
func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	r, _ := utf8.DecodeRune(data)
	return r, nil
}
// utf16ScannerDecoder is the scannerDecoder for UTF-16 encoded input. The
// embedded binary.ByteOrder selects how each 16-bit code unit is read from
// the byte stream; it must be non-nil before either method is called.
type utf16ScannerDecoder struct {
	binary.ByteOrder
}

// ScanBytes is a bufio.SplitFunc that yields one UTF-16 encoded character per
// token: two bytes for a single code unit, or four bytes when a high
// surrogate is immediately followed by a low surrogate.
//
// It returns (0, nil, nil) to request more input while a code unit or a
// possible surrogate pair is still incomplete, and io.ErrUnexpectedEOF when
// the input ends with a single dangling byte. An unpaired surrogate is
// emitted as a two-byte token, which DecodeRune maps to U+FFFD.
func (e utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	const (
		unit          = 2 // bytes per UTF-16 code unit
		highSurrogate = 0xD800
		lowSurrogate  = 0xDC00
		surrogateEnd  = 0xE000
	)

	if len(data) < unit {
		if !atEOF || len(data) == 0 {
			return 0, nil, nil
		}
		return 0, nil, io.ErrUnexpectedEOF
	}

	first := e.ByteOrder.Uint16(data)
	if first < highSurrogate || first >= lowSurrogate {
		// Not a high surrogate: the character is this one code unit.
		return unit, data[:unit], nil
	}

	if len(data) < 2*unit {
		if !atEOF {
			// The low surrogate may still be on its way.
			return 0, nil, nil
		}
		return unit, data[:unit], nil
	}

	second := e.ByteOrder.Uint16(data[unit:])
	if second >= lowSurrogate && second < surrogateEnd {
		return 2 * unit, data[:2*unit], nil
	}
	return unit, data[:unit], nil
}

// DecodeRune decodes the character at the start of data, reading code units
// with the embedded byte order. A valid surrogate pair (four bytes) yields
// the combined code point; otherwise only the first code unit is decoded, and
// an unpaired surrogate yields U+FFFD. Fewer than two bytes is reported as
// io.ErrUnexpectedEOF instead of panicking.
func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	if len(data) < 2 {
		return '\uFFFD', io.ErrUnexpectedEOF
	}

	units := []uint16{e.ByteOrder.Uint16(data)}
	if len(data) >= 4 {
		units = append(units, e.ByteOrder.Uint16(data[2:]))
	}
	// Decode collapses a valid pair into one rune; in every other case the
	// first result is the decoding of the first code unit alone.
	return utf16.Decode(units)[0], nil
}
// scanBytesN is the shared body of fixed-width bufio.SplitFunc
// implementations: it emits the next n bytes of data as one token.
//
// Returns, in the (advance, token, error) order of a SplitFunc:
//   - (0, nil, nil) when the input is exhausted (atEOF with no data), or when
//     fewer than n bytes are buffered and more may still arrive;
//   - (0, nil, io.ErrUnexpectedEOF) when the input ends partway through a
//     token, rather than slicing past the valid data;
//   - (n, data[:n], nil) otherwise.
func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	if len(data) < n {
		if atEOF {
			return 0, nil, io.ErrUnexpectedEOF
		}
		// Not enough buffered yet; have the scanner read more and retry.
		return 0, nil, nil
	}

	return n, data[:n], nil
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment