Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
yaml-cpp
Commits
3e1ba0f3
Commit
3e1ba0f3
authored
Oct 08, 2009
by
Jesse Beder
Browse files
Refactored the UTF-8 emitting
parent
d0b5bf4b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
91 additions
and
67 deletions
+91
-67
src/emitterutils.cpp
src/emitterutils.cpp
+91
-67
No files found.
src/emitterutils.cpp
View file @
3e1ba0f3
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include "stringsource.h"
#include "stringsource.h"
#include <sstream>
#include <sstream>
#include <iomanip>
#include <iomanip>
#include <cassert>
namespace
YAML
namespace
YAML
{
{
...
@@ -41,16 +42,10 @@ namespace YAML
...
@@ -41,16 +42,10 @@ namespace YAML
return
true
;
return
true
;
}
}
// ToUnsigned
// . Maps a char onto its byte value as an unsigned integer; the detour
//   through unsigned char keeps negative chars from being sign-extended.
unsigned ToUnsigned(char ch)
{
	const unsigned char rawByte = static_cast<unsigned char>(ch);
	return rawByte;
}
typedef
unsigned
char
byte
;
unsigned
AdvanceAndGetNextChar
(
std
::
string
::
const_iterator
&
it
,
std
::
string
::
const_iterator
end
)
{
// ToByte
// . Converts a char to the file's 'byte' type via static_cast.
byte ToByte(char ch)
{
	const byte converted = static_cast<byte>(ch);
	return converted;
}
std
::
string
::
const_iterator
jt
=
it
;
++
jt
;
if
(
jt
==
end
)
return
0
;
++
it
;
typedef
std
::
string
::
const_iterator
StrIter
;
return
ToUnsigned
(
*
it
);
}
std
::
string
WriteUnicode
(
unsigned
value
)
{
std
::
string
WriteUnicode
(
unsigned
value
)
{
std
::
stringstream
str
;
std
::
stringstream
str
;
...
@@ -64,74 +59,101 @@ namespace YAML
...
@@ -64,74 +59,101 @@ namespace YAML
return
str
.
str
();
return
str
.
str
();
}
}
std
::
string
WriteSingleByte
(
unsigned
ch
)
{
// GetBytesToRead
return
WriteUnicode
(
ch
);
// . Returns the length (1-4) of the UTF-8 sequence introduced by the lead
//   byte 'signal', or 0 if 'signal' cannot start a sequence.
// . Only the lead byte is inspected; continuation bytes are not checked here.
int GetBytesToRead(byte signal)
{
	if(signal <= 0x7F) // ASCII
		return 1;
	if(signal <= 0xBF) // continuation bytes: invalid as first characters
		return 0;
	if(signal <= 0xDF) // Note: this allows "overlong" UTF8 (0xC0 - 0xC1) to pass unscathed. OK?
		return 2;
	if(signal <= 0xEF)
		return 3;
	if(signal <= 0xF7) // 11110xxx: lead byte of a four-byte sequence
		return 4;
	return 0; // 0xF8 - 0xFF never begin a UTF-8 sequence
}
}
std
::
string
WriteTwoBytes
(
unsigned
ch
,
unsigned
ch1
)
{
// ReadBytes
// Note: if no second byte is provided (signalled by ch1 == 0)
// . Reads the next 'bytesToRead', if we can.
// then we just write the first one as a single byte.
// . Returns zero if we fail, otherwise fills the byte buffer with
// Should we throw an error instead? Or write something else?
// the data and returns the number of bytes read.
// (The same question goes for the other WriteNBytes functions)
int
ReadBytes
(
byte
bytes
[
4
],
StrIter
start
,
StrIter
end
,
int
bytesToRead
)
{
if
(
ch1
==
0
)
for
(
int
i
=
0
;
i
<
bytesToRead
;
i
++
)
{
return
WriteSingleByte
(
ch
);
if
(
start
==
end
)
return
0
;
bytes
[
i
]
=
ToByte
(
*
start
);
++
start
;
}
return
bytesToRead
;
}
unsigned
value
=
((
ch
-
0xC0
)
<<
6
)
+
(
ch1
-
0x80
);
// IsValidUTF8
return
WriteUnicode
(
value
);
// . Assumes bytes[0] is a valid signal byte with the right size passed
// . Returns true only if every trailing byte (bytes[1] .. bytes[size - 1])
//   is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx.
bool IsValidUTF8(byte bytes[4], int size)
{
	for(int i = 1; i < size; i++) {
		// The parentheses are required: '!=' binds tighter than '&', so the
		// unparenthesized form would reduce to 'bytes[i] & 0' and never reject.
		if((bytes[i] & 0xC0) != 0x80)
			return false;
	}
	return true;
}
}
std
::
string
WriteThreeBytes
(
unsigned
ch
,
unsigned
ch1
,
unsigned
ch2
)
{
byte
UTF8SignalPrefix
(
int
size
)
{
if
(
ch1
==
0
)
switch
(
size
)
{
return
WriteSingleByte
(
ch
);
case
1
:
return
0
;
if
(
ch2
==
0
)
case
2
:
return
0xC0
;
return
WriteSingleByte
(
ch
)
+
WriteSingleByte
(
ch1
);
case
3
:
return
0xE0
;
case
4
:
return
0xF0
;
}
assert
(
false
);
return
0
;
}
unsigned
value
=
((
ch
-
0xE0
)
<<
12
)
+
((
ch1
-
0x80
)
<<
6
)
+
(
ch2
-
0x80
);
unsigned
UTF8ToUnicode
(
byte
bytes
[
4
],
int
size
)
{
return
WriteUnicode
(
value
);
unsigned
value
=
bytes
[
0
]
-
UTF8SignalPrefix
(
size
);
for
(
int
i
=
1
;
i
<
size
;
i
++
)
value
=
(
value
<<
6
)
+
(
bytes
[
i
]
-
0x80
);
return
value
;
}
}
std
::
string
WriteFourBytes
(
unsigned
ch
,
unsigned
ch1
,
unsigned
ch2
,
unsigned
ch3
)
{
// ReadUTF8
if
(
ch1
==
0
)
// . Returns the Unicode code point starting at 'start',
return
WriteSingleByte
(
ch
);
// and sets 'bytesRead' to the length of the UTF-8 Sequence
if
(
ch2
==
0
)
// . If it's invalid UTF8, we set 'bytesRead' to zero.
return
WriteSingleByte
(
ch
)
+
WriteSingleByte
(
ch1
);
unsigned
ReadUTF8
(
StrIter
start
,
StrIter
end
,
int
&
bytesRead
)
{
if
(
ch3
==
0
)
int
bytesToRead
=
GetBytesToRead
(
ToByte
(
*
start
));
return
WriteSingleByte
(
ch
)
+
WriteSingleByte
(
ch1
)
+
WriteSingleByte
(
ch2
);
if
(
!
bytesToRead
)
{
bytesRead
=
0
;
return
0
;
}
byte
bytes
[
4
];
bytesRead
=
ReadBytes
(
bytes
,
start
,
end
,
bytesToRead
);
if
(
!
bytesRead
)
return
0
;
if
(
!
IsValidUTF8
(
bytes
,
bytesRead
))
{
bytesRead
=
0
;
return
0
;
}
unsigned
value
=
((
ch
-
0xF0
)
<<
18
)
+
((
ch1
-
0x80
)
<<
12
)
+
((
ch2
-
0x80
)
<<
6
)
+
(
ch3
-
0x80
);
return
UTF8ToUnicode
(
bytes
,
bytesRead
);
return
WriteUnicode
(
value
);
}
}
// WriteNonPrintable
// WriteNonPrintable
// . Writes the next UTF-8 code point to the stream
// . Writes the next UTF-8 code point to the stream
std
::
string
::
const_iterator
WriteNonPrintable
(
ostream
&
out
,
std
::
string
::
const_iterator
start
,
std
::
string
::
const_iterator
end
)
{
int
WriteNonPrintable
(
ostream
&
out
,
StrIter
start
,
StrIter
end
)
{
std
::
string
::
const_iterator
it
=
start
;
int
bytesRead
=
0
;
unsigned
ch
=
ToUnsigned
(
*
it
);
unsigned
value
=
ReadUTF8
(
start
,
end
,
bytesRead
);
if
(
ch
<=
0xC1
)
{
// this may include invalid first characters (0x80 - 0xBF)
if
(
bytesRead
==
0
)
{
// or "overlong" UTF-8 (0xC0 - 0xC1)
// TODO: is it ok to just write the replacement character here,
// We just copy them as bytes
// or should we instead write the invalid byte (as \xNN)?
// TODO: should we do something else? throw an error?
out
<<
WriteUnicode
(
0xFFFD
);
out
<<
WriteSingleByte
(
ch
);
return
1
;
return
start
;
}
else
if
(
ch
<=
0xDF
)
{
unsigned
ch1
=
AdvanceAndGetNextChar
(
it
,
end
);
out
<<
WriteTwoBytes
(
ch
,
ch1
);
return
it
;
}
else
if
(
ch
<=
0xEF
)
{
unsigned
ch1
=
AdvanceAndGetNextChar
(
it
,
end
);
unsigned
ch2
=
AdvanceAndGetNextChar
(
it
,
end
);
out
<<
WriteThreeBytes
(
ch
,
ch1
,
ch2
);
return
it
;
}
else
{
unsigned
ch1
=
AdvanceAndGetNextChar
(
it
,
end
);
unsigned
ch2
=
AdvanceAndGetNextChar
(
it
,
end
);
unsigned
ch3
=
AdvanceAndGetNextChar
(
it
,
end
);
out
<<
WriteFourBytes
(
ch
,
ch1
,
ch2
,
ch3
);
return
it
;
}
}
return
start
;
out
<<
WriteUnicode
(
value
);
return
bytesRead
;
}
}
}
}
...
@@ -164,7 +186,7 @@ namespace YAML
...
@@ -164,7 +186,7 @@ namespace YAML
bool
WriteDoubleQuotedString
(
ostream
&
out
,
const
std
::
string
&
str
)
bool
WriteDoubleQuotedString
(
ostream
&
out
,
const
std
::
string
&
str
)
{
{
out
<<
"
\"
"
;
out
<<
"
\"
"
;
for
(
std
::
string
::
const_iterato
r
it
=
str
.
begin
();
it
!=
str
.
end
();
++
it
)
{
for
(
StrIte
r
it
=
str
.
begin
();
it
!=
str
.
end
();
++
it
)
{
char
ch
=
*
it
;
char
ch
=
*
it
;
if
(
IsPrintable
(
ch
))
{
if
(
IsPrintable
(
ch
))
{
if
(
ch
==
'\"'
)
if
(
ch
==
'\"'
)
...
@@ -174,7 +196,9 @@ namespace YAML
...
@@ -174,7 +196,9 @@ namespace YAML
else
else
out
<<
ch
;
out
<<
ch
;
}
else
{
}
else
{
it
=
WriteNonPrintable
(
out
,
it
,
str
.
end
());
int
bytesRead
=
WriteNonPrintable
(
out
,
it
,
str
.
end
());
if
(
bytesRead
>=
1
)
it
+=
(
bytesRead
-
1
);
}
}
}
}
out
<<
"
\"
"
;
out
<<
"
\"
"
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment