emitterutils.cpp 8.39 KB
Newer Older
1
2
3
#include "emitterutils.h"
#include "exp.h"
#include "indentation.h"
4
#include "yaml-cpp/exceptions.h"
5
#include "stringsource.h"
6
7
#include <sstream>
#include <iomanip>
8
9
10
11
12
13

namespace YAML
{
	namespace Utils
	{
		namespace {
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
			// Code point (U+FFFD) substituted by the helpers below whenever input
			// bytes cannot be decoded or a code point is not allowed.
			enum {REPLACEMENT_CHARACTER = 0xFFFD};

			// Tests whether `ch` (a Unicode code point) may appear in an anchor or
			// alias name (ns-anchor-char): any printable, non-space character
			// that is not a flow indicator.
			// Returns true if the character is allowed, false otherwise.
			bool IsAnchorChar(int ch) { // test for ns-anchor-char
				switch (ch) {
					case ',': case '[': case ']': case '{': case '}': // c-flow-indicator
					case ' ': case '\t': // s-white
					case 0xFEFF: // c-byte-order-mark
					case 0xA: case 0xD: // b-char
						return false;
					case 0x85: // NEL is explicitly allowed although it lies in the C1 range
						return true;
				}

				// C0 control characters (and any negative value)
				if (ch < 0x20)
					return false;

				// Printable ASCII, including '~' (0x7E)
				if (ch <= 0x7E)
					return true;

				// DEL and the C1 control characters
				if (ch < 0xA0)
					return false;
				// UTF-16 surrogates
				if (ch >= 0xD800 && ch <= 0xDFFF)
					return false;
				// U+xxFFFE / U+xxFFFF non-characters in every plane
				if ((ch & 0xFFFE) == 0xFFFE)
					return false;
				// U+FDD0..U+FDEF non-characters
				if ((ch >= 0xFDD0) && (ch <= 0xFDEF))
					return false;
				// Beyond the Unicode range
				if (ch > 0x10FFFF)
					return false;

				return true;
			}
			
			// Returns the total length (1-4) of the UTF-8 sequence introduced by
			// the lead byte `ch`, or -1 if `ch` cannot start a sequence (it is a
			// continuation byte 0x80-0xBF or one of the bytes 0xF8-0xFF, which
			// never occur in UTF-8).
			int Utf8BytesIndicated(char ch) {
				int byteVal = static_cast<unsigned char>(ch);
				switch (byteVal >> 4) {
					case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
						return 1;
					case 12: case 13:
						return 2;
					case 14:
						return 3;
					case 15:
						// Only 11110xxx introduces a 4-byte sequence; 11111xxx is invalid.
						return (byteVal & 0x08) ? -1 : 4;
					default:
						return -1;
				}
			}

			// Returns true if `ch` is a UTF-8 continuation byte, i.e. its two
			// top bits are 10 (0x80-0xBF).
			bool IsTrailingByte(char ch) {
				return (ch & 0xC0) == 0x80;
			}
			
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
			// Decodes one UTF-8 code point starting at `first` and moves `first`
			// past the bytes that were consumed.
			//
			// codePoint: receives the decoded value, or REPLACEMENT_CHARACTER if
			//            the bytes are malformed or encode a disallowed code point.
			// first:     current read position; advanced on success.
			// last:      end of the input range.
			//
			// Returns false only when the range is already exhausted; in every
			// other case a code point is produced and true is returned.
			bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) {
				if (first == last)
					return false;

				const int seqLength = Utf8BytesIndicated(*first);

				// A byte that cannot start a sequence is skipped on its own
				if (seqLength < 1) {
					codePoint = REPLACEMENT_CHARACTER;
					++first;
					return true;
				}

				// Single-byte sequences are taken verbatim
				if (seqLength == 1) {
					codePoint = *first;
					++first;
					return true;
				}

				// The lead byte contributes its low (7 - seqLength) bits
				const int leadMask = ~(0xFF << (7 - seqLength));
				codePoint = static_cast<unsigned char>(*first) & leadMask;
				++first;

				// Each continuation byte contributes six more bits. A missing or
				// malformed continuation byte ends the sequence early; that byte
				// is left unconsumed so the next call examines it.
				int remaining = seqLength - 1;
				while (remaining > 0) {
					if ((first == last) || !IsTrailingByte(*first)) {
						codePoint = REPLACEMENT_CHARACTER;
						break;
					}
					codePoint = (codePoint << 6) | (*first & 0x3F);
					++first;
					--remaining;
				}

				// Reject values that are not legal code points
				const bool beyondUnicode = codePoint > 0x10FFFF;
				const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
				const bool isPlaneNonChar = (codePoint & 0xFFFE) == 0xFFFE;
				const bool isBmpNonChar = codePoint >= 0xFDD0 && codePoint <= 0xFDEF;
				if (beyondUnicode || isSurrogate || isPlaneNonChar || isBmpNonChar)
					codePoint = REPLACEMENT_CHARACTER;

				return true;
			}
			
			// Writes `codePoint` to `out` encoded as UTF-8.
			// Values outside 0..0x10FFFF are replaced by REPLACEMENT_CHARACTER.
			// The shortest encoding is always used: the upper bound of each
			// length class is inclusive (0x7F, 0x7FF, 0xFFFF), otherwise those
			// three values would be emitted as overlong, invalid sequences.
			void WriteCodePoint(ostream& out, int codePoint) {
				if (codePoint < 0 || codePoint > 0x10FFFF) {
					codePoint = REPLACEMENT_CHARACTER;
				}
				if (codePoint <= 0x7F) {
					// 0xxxxxxx
					out << static_cast<char>(codePoint);
				} else if (codePoint <= 0x7FF) {
					// 110xxxxx 10xxxxxx
					out << static_cast<char>(0xC0 | (codePoint >> 6))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else if (codePoint <= 0xFFFF) {
					// 1110xxxx 10xxxxxx 10xxxxxx
					out << static_cast<char>(0xE0 | (codePoint >> 12))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else {
					// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					out << static_cast<char>(0xF0 | (codePoint >> 18))
					    << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				}
			}
			
			bool IsValidPlainScalar(const std::string& str, bool inFlow, bool allowOnlyAscii) {
131
				// first check the start
132
				const RegEx& start = (inFlow ? Exp::PlainScalarInFlow() : Exp::PlainScalar());
133
134
135
136
137
138
139
140
				if(!start.Matches(str))
					return false;
				
				// and check the end for plain whitespace (which can't be faithfully kept in a plain scalar)
				if(!str.empty() && *str.rbegin() == ' ')
					return false;

				// then check until something is disallowed
141
142
143
144
145
146
				const RegEx& disallowed = (inFlow ? Exp::EndScalarInFlow() : Exp::EndScalar())
				                          || (Exp::BlankOrBreak() + Exp::Comment())
				                          || Exp::NotPrintable()
				                          || Exp::Utf8_ByteOrderMark()
				                          || Exp::Break()
				                          || Exp::Tab();
147
148
				StringCharSource buffer(str.c_str(), str.size());
				while(buffer) {
149
150
					if(disallowed.Matches(buffer))
						return false;
151
152
					if(allowOnlyAscii && (0x7F < static_cast<unsigned char>(buffer[0]))) 
						return false;
153
154
155
156
157
					++buffer;
				}
				
				return true;
			}
Jesse Beder's avatar
Jesse Beder committed
158

159
160
			// Writes `codePoint` to `out` as a hexadecimal escape sequence for
			// use inside a double-quoted scalar:
			//   \xNN        for values below 0xFF,
			//   \uNNNN      for values below 0xFFFF,
			//   \UNNNNNNNN  otherwise.
			// Hex digits are lower case. Note that exactly 0xFF and exactly
			// 0xFFFF take the next wider form.
			void WriteDoubleQuoteEscapeSequence(ostream& out, int codePoint) {
				static const char hexDigits[] = "0123456789abcdef";

				// Template sized for the longest form; shorter forms overwrite
				// the prefix letter and are NUL-terminated early below.
				char escSeq[] = "\\U00000000";
				int digits = 8;
				if (codePoint < 0xFF) {
					escSeq[1] = 'x';
					digits = 2;
				} else if (codePoint < 0xFFFF) {
					escSeq[1] = 'u';
					digits = 4;
				}

				// Write digits into the escape sequence, most significant nibble first
				int i = 2;
				for (; digits > 0; --digits, ++i) {
					escSeq[i] = hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
				}

				escSeq[i] = 0; // terminate with NUL character
				out << escSeq;
			}

182
183
184
185
186
187
188
189
			// Writes `str` to `out` as an anchor/alias name, one code point at a
			// time. Returns false as soon as a code point is not accepted by
			// IsAnchorChar; code points written before that point remain in
			// `out`. Returns true if the whole string was written.
			bool WriteAliasName(ostream& out, const std::string& str) {
				int codePoint;
				for(std::string::const_iterator i = str.begin();
					GetNextCodePointAndAdvance(codePoint, i, str.end());
					)
				{
					if (!IsAnchorChar(codePoint))
						return false;

					WriteCodePoint(out, codePoint);
				}
				return true;
			}
195
196
		}
		
197
		// Writes `str` to `out` unquoted if IsValidPlainScalar accepts it for
		// the given context; otherwise delegates to WriteDoubleQuotedString.
		//
		// inFlow:         passed to IsValidPlainScalar to select flow-context rules.
		// escapeNonAscii: forbids non-ASCII bytes in the plain form and is
		//                 forwarded to WriteDoubleQuotedString.
		//
		// Returns true for the plain form, otherwise the result of
		// WriteDoubleQuotedString.
		bool WriteString(ostream& out, const std::string& str, bool inFlow, bool escapeNonAscii)
		{
			if(IsValidPlainScalar(str, inFlow, escapeNonAscii)) {
				out << str;
				return true;
			} else
				return WriteDoubleQuotedString(out, str, escapeNonAscii);
		}
		
		// Writes `str` to `out` as a single-quoted scalar; embedded single
		// quotes are doubled.
		// Returns false if the string contains a newline, which this function
		// does not support; the opening quote and any text already written
		// remain in `out` in that case. Returns true otherwise.
		bool WriteSingleQuotedString(ostream& out, const std::string& str)
		{
			out << "'";
			int codePoint;
			for(std::string::const_iterator i = str.begin();
				GetNextCodePointAndAdvance(codePoint, i, str.end());
				) 
			{
				if (codePoint == '\n')
					return false;  // We can't handle a new line and the attendant indentation yet

				if (codePoint == '\'')
					out << "''";
				else
					WriteCodePoint(out, codePoint);
			}
			out << "'";
			return true;
		}
		
226
		// Writes `str` to `out` as a double-quoted scalar.
		//
		// The double quote and the backslash are backslash-escaped. Control
		// characters (below 0x20 and 0x7F-0x9F), the non-breaking space
		// (0xA0) and the byte order mark (0xFEFF) are written as hexadecimal
		// escape sequences. When `escapeNonAscii` is true every code point
		// above 0x7E is escaped as well. Everything else is written as UTF-8.
		//
		// Always returns true.
		bool WriteDoubleQuotedString(ostream& out, const std::string& str, bool escapeNonAscii)
		{
			out << "\"";
			int codePoint;
			for(std::string::const_iterator i = str.begin();
				GetNextCodePointAndAdvance(codePoint, i, str.end());
				) 
			{
				if (codePoint == '\"')
					out << "\\\"";
				else if (codePoint == '\\')
					out << "\\\\";
				// DEL (0x7F) is not a printable character in YAML, so it is
				// escaped together with the C0/C1 control characters.
				else if (codePoint < 0x20 || (codePoint >= 0x7F && codePoint <= 0xA0)) // Control characters and non-breaking space
					WriteDoubleQuoteEscapeSequence(out, codePoint);
				else if (codePoint == 0xFEFF) // Byte order marks (ZWNS) should be escaped (YAML 1.2, sec. 5.2)
					WriteDoubleQuoteEscapeSequence(out, codePoint);
				else if (escapeNonAscii && codePoint > 0x7E)
					WriteDoubleQuoteEscapeSequence(out, codePoint);
				else
					WriteCodePoint(out, codePoint);
			}
			out << "\"";
			return true;
		}

		// Writes `str` to `out` as a literal block scalar: a "|" header line
		// followed by the text, with IndentTo(indent) emitted at the start of
		// the first line and after every newline in `str`.
		// Always returns true.
		bool WriteLiteralString(ostream& out, const std::string& str, int indent)
		{
			out << "|\n";
			out << IndentTo(indent);
			int codePoint;
			for(std::string::const_iterator i = str.begin();
				GetNextCodePointAndAdvance(codePoint, i, str.end());
				)
			{
				if (codePoint == '\n')
				  out << "\n" << IndentTo(indent);
				else
				  WriteCodePoint(out, codePoint);
			}
			return true;
		}
		
		// Writes `str` to `out` as a comment. Each line is introduced by "#"
		// followed by Indentation(postCommentIndent). After a newline in `str`
		// the output is brought back to the column at which the comment
		// started (via IndentTo) before the next "#" is written.
		// Always returns true.
		bool WriteComment(ostream& out, const std::string& str, int postCommentIndent)
		{
			// Column where the comment begins, captured before anything is written
			unsigned curIndent = out.col();
			out << "#" << Indentation(postCommentIndent);
			int codePoint;
			for(std::string::const_iterator i = str.begin();
				GetNextCodePointAndAdvance(codePoint, i, str.end());
				)
			{
				if(codePoint == '\n')
					out << "\n" << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
				else
					WriteCodePoint(out, codePoint);
			}
			return true;
		}

		// Writes an alias: "*" followed by the name `str`.
		// Returns the result of WriteAliasName, i.e. false if the name
		// contains a character that is not allowed; the "*" has already been
		// written in that case.
		bool WriteAlias(ostream& out, const std::string& str)
		{
			out << "*";
			return WriteAliasName(out, str);
		}
		
		// Writes an anchor: "&" followed by the name `str`.
		// Returns the result of WriteAliasName, i.e. false if the name
		// contains a character that is not allowed; the "&" has already been
		// written in that case.
		bool WriteAnchor(ostream& out, const std::string& str)
		{
			out << "&";
			return WriteAliasName(out, str);
		}
296

297
		// Writes a tag to `out`.
		//
		// verbatim: when true the tag is written as "!<" str ">" and validated
		//           against Exp::URI(); otherwise it is written as "!" str and
		//           validated against Exp::Tag().
		//
		// The string is consumed in chunks: at each position the expression
		// must match at least one character, and the matched characters are
		// copied to `out`. Returns false at the first position where nothing
		// matches (output written so far is kept); returns true otherwise.
		bool WriteTag(ostream& out, const std::string& str, bool verbatim)
		{
			out << (verbatim ? "!<" : "!");
			StringCharSource buffer(str.c_str(), str.size());
			const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
			while(buffer) {
				// n is the number of characters matched at the current position
				int n = reValid.Match(buffer);
				if(n <= 0)
					return false;

				while(--n >= 0) {
					out << buffer[0];
					++buffer;
				}
			}
			if (verbatim)
				out << ">";
			return true;
		}
316
317
318
	}
}