emitterutils.cpp 12 KB
Newer Older
1
2
3
#include "emitterutils.h"
#include "exp.h"
#include "indentation.h"
4
#include "yaml-cpp/binary.h"
5
#include "yaml-cpp/exceptions.h"
6
#include "stringsource.h"
7
8
#include <sstream>
#include <iomanip>
9
10
11
12
13
14

namespace YAML
{
	namespace Utils
	{
		namespace {
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
			// U+FFFD: the code point substituted whenever a byte sequence cannot be
			// decoded, or a code point is not allowed to be written.
			enum {REPLACEMENT_CHARACTER = 0xFFFD};

			// Tests whether a code point may appear in an anchor or alias name
			// (YAML production ns-anchor-char).
			//
			// Returns false for flow indicators, white space, line breaks, the byte
			// order mark, control characters, surrogates, noncharacters and anything
			// above U+10FFFF (or below zero); returns true otherwise.
			bool IsAnchorChar(int ch) { // test for ns-anchor-char
				switch (ch) {
					case ',': case '[': case ']': case '{': case '}': // c-flow-indicator
					case ' ': case '\t': // s-white
					case 0xFEFF: // c-byte-order-mark
					case 0xA: case 0xD: // b-char
						return false;
					case 0x85: // NEL is printable even though it sits in the C1 range
						return true;
				}

				// C0 control characters (and negative values)
				if (ch < 0x20)
					return false;

				// Printable ASCII runs up to and including '~' (0x7E)
				if (ch <= 0x7E)
					return true;

				// DEL and the C1 control characters
				if (ch < 0xA0)
					return false;
				// UTF-16 surrogate halves
				if (ch >= 0xD800 && ch <= 0xDFFF)
					return false;
				// The last two code points of every plane are noncharacters
				if ((ch & 0xFFFE) == 0xFFFE)
					return false;
				// Noncharacter block U+FDD0..U+FDEF
				if ((ch >= 0xFDD0) && (ch <= 0xFDEF))
					return false;
				// Beyond the Unicode range
				if (ch > 0x10FFFF)
					return false;

				return true;
			}
			
			// Reports the length of a UTF-8 sequence as announced by its lead byte:
			// 1 for 0x00-0x7F, 2 for 0xC0-0xDF, 3 for 0xE0-0xEF and 4 for 0xF0-0xFF.
			// Returns -1 for 0x80-0xBF, which cannot start a sequence.
			int Utf8BytesIndicated(char ch) {
				const unsigned char lead = static_cast<unsigned char>(ch);
				if (lead < 0x80)
					return 1;
				if (lead < 0xC0)
					return -1;
				if (lead < 0xE0)
					return 2;
				if (lead < 0xF0)
					return 3;
				return 4;
			}

			// Returns true when ch is a UTF-8 continuation byte, i.e. its two top
			// bits are 10.
			bool IsTrailingByte(char ch) {
				// Mask the unsigned value so the test never operates on a negative
				// int when char is signed.
				return (static_cast<unsigned char>(ch) & 0xC0) == 0x80;
			}
			
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
			// Decodes the UTF-8 sequence starting at `first` into codePoint and moves
			// `first` past the bytes that were consumed.
			//
			// Returns false only when first == last (nothing left to read); in that
			// case codePoint is left untouched. Otherwise returns true.
			// Malformed input produces REPLACEMENT_CHARACTER: a byte that cannot lead
			// a sequence is consumed on its own, while a sequence that ends early or
			// hits a non-trailing byte stops in front of the offending position.
			bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) {
				if (first == last)
					return false;

				const int seqLength = Utf8BytesIndicated(*first);
				if (seqLength < 1) {
					// Not a valid lead byte: skip it and report a replacement
					codePoint = REPLACEMENT_CHARACTER;
					++first;
					return true;
				}

				if (seqLength == 1) {
					// Plain single-byte character
					codePoint = *first;
					++first;
					return true;
				}

				// The lead byte of an n-byte sequence carries 7 - n payload bits
				const int leadMask = (1 << (7 - seqLength)) - 1;
				codePoint = static_cast<unsigned char>(*first) & leadMask;
				++first;

				// Every trailing byte contributes six more bits
				int remaining = seqLength - 1;
				while (remaining > 0) {
					if ((first == last) || !IsTrailingByte(*first)) {
						codePoint = REPLACEMENT_CHARACTER;
						break;
					}
					codePoint = (codePoint << 6) | (*first & 0x3F);
					++first;
					--remaining;
				}

				// Values outside Unicode, surrogates and noncharacters are replaced
				const bool beyondUnicode = codePoint > 0x10FFFF;
				const bool surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
				const bool planeEnd = (codePoint & 0xFFFE) == 0xFFFE;
				const bool nonCharBlock = codePoint >= 0xFDD0 && codePoint <= 0xFDEF;
				if (beyondUnicode || surrogate || planeEnd || nonCharBlock)
					codePoint = REPLACEMENT_CHARACTER;
				return true;
			}
			
			// Writes codePoint to out as UTF-8, using the shortest encoding.
			// Values below zero or above U+10FFFF are written as
			// REPLACEMENT_CHARACTER instead.
			void WriteCodePoint(ostream& out, int codePoint) {
				if (codePoint < 0 || codePoint > 0x10FFFF) {
					codePoint = REPLACEMENT_CHARACTER;
				}
				// The bounds are inclusive: U+007F, U+07FF and U+FFFF are the largest
				// values that fit in one, two and three bytes. Pushing them into the
				// next longer form would produce an overlong (invalid) encoding.
				if (codePoint <= 0x7F) {
					out << static_cast<char>(codePoint);
				} else if (codePoint <= 0x7FF) {
					out << static_cast<char>(0xC0 | (codePoint >> 6))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else if (codePoint <= 0xFFFF) {
					out << static_cast<char>(0xE0 | (codePoint >> 12))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else {
					out << static_cast<char>(0xF0 | (codePoint >> 18))
					    << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				}
			}
			
131
			bool IsValidPlainScalar(const std::string& str, FlowType::value flowType, bool allowOnlyAscii) {
132
133
134
				if(str.empty())
					return false;
				
135
				// first check the start
136
				const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow() : Exp::PlainScalar());
137
138
139
140
141
142
143
144
				if(!start.Matches(str))
					return false;
				
				// and check the end for plain whitespace (which can't be faithfully kept in a plain scalar)
				if(!str.empty() && *str.rbegin() == ' ')
					return false;

				// then check until something is disallowed
145
				const RegEx& disallowed = (flowType == FlowType::Flow ? Exp::EndScalarInFlow() : Exp::EndScalar())
146
147
148
149
150
				                          || (Exp::BlankOrBreak() + Exp::Comment())
				                          || Exp::NotPrintable()
				                          || Exp::Utf8_ByteOrderMark()
				                          || Exp::Break()
				                          || Exp::Tab();
151
152
				StringCharSource buffer(str.c_str(), str.size());
				while(buffer) {
153
154
					if(disallowed.Matches(buffer))
						return false;
155
					if(allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) 
156
						return false;
157
158
159
160
161
					++buffer;
				}
				
				return true;
			}
Jesse Beder's avatar
Jesse Beder committed
162

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
			// Decides whether str can be emitted as a single-quoted scalar.
			// A newline anywhere rules it out, and so does any byte >= 0x80 when
			// escapeNonAscii is set.
			bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii)
			{
				// TODO: check for non-printable characters?
				for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
					const unsigned char byte = static_cast<unsigned char>(*it);
					if(byte == '\n')
						return false;
					if(escapeNonAscii && byte >= 0x80)
						return false;
				}
				return true;
			}
            
            // Decides whether str can be emitted as a literal block scalar.
            // Never valid in flow context; with escapeNonAscii set, any byte
            // >= 0x80 rules it out.
            bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType, bool escapeNonAscii)
            {
                if(flowType == FlowType::Flow)
                    return false;

                // without the ASCII restriction no character is rejected
                if(!escapeNonAscii)
                    return true;

                // TODO: check for non-printable characters?
                for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
                    if(static_cast<unsigned char>(*it) >= 0x80)
                        return false;
                }
                return true;
            }
			
            // Writes codePoint to out as a double-quoted escape sequence with
            // lowercase hex digits: \xNN below 0xFF, \uNNNN below 0xFFFF and
            // \UNNNNNNNN for everything else.
            void WriteDoubleQuoteEscapeSequence(ostream& out, int codePoint) {
                static const char hexDigits[] = "0123456789abcdef";

                // choose the escape marker and how many hex digits follow it
                char marker = 'U';
                int digitCount = 8;
                if (codePoint < 0xFF) {
                    marker = 'x';
                    digitCount = 2;
                } else if (codePoint < 0xFFFF) {
                    marker = 'u';
                    digitCount = 4;
                }

                // backslash + marker + up to 8 digits + terminating NUL
                char escSeq[11];
                escSeq[0] = '\\';
                escSeq[1] = marker;

                // most significant nibble first
                for (int pos = 0; pos < digitCount; ++pos) {
                    const int shift = 4 * (digitCount - 1 - pos);
                    escSeq[2 + pos] = hexDigits[(codePoint >> shift) & 0xF];
                }

                escSeq[2 + digitCount] = 0; // terminate with NUL character
                out << escSeq;
            }

211
212
213
214
215
216
217
218
			// Writes str to out one code point at a time, as an anchor/alias name.
			// Stops and returns false at the first code point IsAnchorChar rejects;
			// everything before it has already been written. Returns true when the
			// whole string was written.
			bool WriteAliasName(ostream& out, const std::string& str) {
				std::string::const_iterator cursor = str.begin();
				const std::string::const_iterator end = str.end();

				int codePoint;
				while (GetNextCodePointAndAdvance(codePoint, cursor, end)) {
					if (!IsAnchorChar(codePoint))
						return false;

					WriteCodePoint(out, codePoint);
				}
				return true;
			}
224
225
		}
		
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
        // Picks the format a string is actually emitted in.
        // The requested format (Auto -> plain, SingleQuoted, Literal) is used only
        // if the matching validity check accepts str; every other case, including
        // an explicit DoubleQuoted request or an unrecognized manipulator, falls
        // back to double quotes.
        StringFormat::value ComputeStringFormat(const std::string& str, EMITTER_MANIP strFormat, FlowType::value flowType, bool escapeNonAscii)
        {
            if(strFormat == Auto && IsValidPlainScalar(str, flowType, escapeNonAscii))
                return StringFormat::Plain;

            if(strFormat == SingleQuoted && IsValidSingleQuotedScalar(str, escapeNonAscii))
                return StringFormat::SingleQuoted;

            if(strFormat == Literal && IsValidLiteralScalar(str, flowType, escapeNonAscii))
                return StringFormat::Literal;

            return StringFormat::DoubleQuoted;
        }

250
251
252
		// Writes str to out as a single-quoted scalar, doubling every embedded
		// single quote. Returns false as soon as a newline is met; by then the
		// opening quote and the preceding characters have already been written.
		bool WriteSingleQuotedString(ostream& out, const std::string& str)
		{
			out << "'";

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				// We can't handle a new line and the attendant indentation yet
				if (codePoint == '\n')
					return false;

				if (codePoint != '\'')
					WriteCodePoint(out, codePoint);
				else
					out << "''";
			}

			out << "'";
			return true;
		}
		
270
		// Writes str to out as a double-quoted scalar and always returns true.
		// Quote, backslash, newline, tab, carriage return and backspace use their
		// short escapes; control characters, 0x80-0xA0 and the byte order mark are
		// written as hex escapes, as is everything above 0x7E when escapeNonAscii
		// is set. All other code points are written as UTF-8.
		bool WriteDoubleQuotedString(ostream& out, const std::string& str, bool escapeNonAscii)
		{
			out << "\"";

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				if(codePoint == '\"') {
					out << "\\\"";
				} else if(codePoint == '\\') {
					out << "\\\\";
				} else if(codePoint == '\n') {
					out << "\\n";
				} else if(codePoint == '\t') {
					out << "\\t";
				} else if(codePoint == '\r') {
					out << "\\r";
				} else if(codePoint == '\b') {
					out << "\\b";
				} else {
					// Control characters and non-breaking space
					const bool control = codePoint < 0x20 || (codePoint >= 0x80 && codePoint <= 0xA0);
					// Byte order marks (ZWNS) should be escaped (YAML 1.2, sec. 5.2)
					const bool byteOrderMark = (codePoint == 0xFEFF);
					const bool nonAscii = escapeNonAscii && codePoint > 0x7E;

					if(control || byteOrderMark || nonAscii)
						WriteDoubleQuoteEscapeSequence(out, codePoint);
					else
						WriteCodePoint(out, codePoint);
				}
			}

			out << "\"";
			return true;
		}

		// Writes str to out as a literal block scalar: a "|" header line, then
		// the content with every line started at the given indent. Always
		// returns true.
		bool WriteLiteralString(ostream& out, const std::string& str, int indent)
		{
			out << "|\n";
			out << IndentTo(indent);

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				if (codePoint != '\n') {
					WriteCodePoint(out, codePoint);
					continue;
				}
				// start the next line of the block at the requested indent
				out << "\n" << IndentTo(indent);
			}
			return true;
		}
		
Jesse Beder's avatar
Jesse Beder committed
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
		// Writes a single char to out as a scalar and always returns true.
		// ASCII letters are written bare; every other character is wrapped in
		// double quotes, using a short escape where one is handled here and a hex
		// escape for anything outside printable ASCII.
		bool WriteChar(ostream& out, char ch)
		{
			if(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
				out << ch;
			// A double quote or backslash must be escaped inside double quotes;
			// written raw they would produce """ and "\", neither of which is a
			// valid scalar.
			else if(ch == '\"')
				out << "\"\\\"\"";
			else if(ch == '\\')
				out << "\"\\\\\"";
			else if(0x20 <= ch && ch <= 0x7e)
				out << "\"" << ch << "\"";
			else if(ch == '\t')
				out << "\"\\t\"";
			else if(ch == '\n')
				out << "\"\\n\"";
			else if(ch == '\b')
				out << "\"\\b\"";
			else {
				out << "\"";
				WriteDoubleQuoteEscapeSequence(out, ch);
				out << "\"";
			}
			return true;
		}

337
338
		// Writes str to out as a comment and always returns true.
		// Each line starts with "#" followed by postCommentIndent worth of
		// indentation; lines after the first are indented to the column the
		// comment started at. set_comment() is called on out after every "#"
		// prefix.
		bool WriteComment(ostream& out, const std::string& str, int postCommentIndent)
		{
			// remember where the comment begins so continuation lines line up
			const unsigned curIndent = out.col();
			out << "#" << Indentation(postCommentIndent);
			out.set_comment();

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				if(codePoint != '\n') {
					WriteCodePoint(out, codePoint);
					continue;
				}
				// a newline in the text starts a fresh comment line
				out << "\n" << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
				out.set_comment();
			}
			return true;
		}

		// Writes "*" followed by the alias name str.
		// Returns what WriteAliasName returns; the "*" is written regardless.
		bool WriteAlias(ostream& out, const std::string& str)
		{
			out << "*";
			const bool nameWritten = WriteAliasName(out, str);
			return nameWritten;
		}
		
		// Writes "&" followed by the anchor name str.
		// Returns what WriteAliasName returns; the "&" is written regardless.
		bool WriteAnchor(ostream& out, const std::string& str)
		{
			out << "&";
			const bool nameWritten = WriteAliasName(out, str);
			return nameWritten;
		}
368

369
		// Writes str to out as a tag: "!<...>" when verbatim, "!..." otherwise.
		// The text is copied in chunks matched by Exp::URI() (verbatim) or
		// Exp::Tag(). Returns false at the first position with no match, leaving
		// what was written so far in out; returns true once everything is written.
		bool WriteTag(ostream& out, const std::string& str, bool verbatim)
		{
			if(verbatim)
				out << "!<";
			else
				out << "!";

			const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
			StringCharSource buffer(str.c_str(), str.size());
			while(buffer) {
				const int matched = reValid.Match(buffer);
				if(matched <= 0)
					return false;

				// copy exactly the characters that were matched
				for(int copied = 0; copied < matched; ++copied) {
					out << buffer[0];
					++buffer;
				}
			}

			if (verbatim)
				out << ">";
			return true;
		}
388

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
		// Writes a tag of the form "!prefix!tag" to out.
		// The prefix is copied in chunks matched by Exp::URI(), the tag in chunks
		// matched by Exp::Tag(). Returns false at the first position with no
		// match, leaving what was written so far in out; returns true otherwise.
		bool WriteTagWithPrefix(ostream& out, const std::string& prefix, const std::string& tag)
		{
			out << "!";
			for(StringCharSource prefixBuffer(prefix.c_str(), prefix.size()); prefixBuffer; ) {
				const int matched = Exp::URI().Match(prefixBuffer);
				if(matched <= 0)
					return false;

				// copy exactly the characters that were matched
				for(int copied = 0; copied < matched; ++copied) {
					out << prefixBuffer[0];
					++prefixBuffer;
				}
			}

			out << "!";
			for(StringCharSource tagBuffer(tag.c_str(), tag.size()); tagBuffer; ) {
				const int matched = Exp::Tag().Match(tagBuffer);
				if(matched <= 0)
					return false;

				// copy exactly the characters that were matched
				for(int copied = 0; copied < matched; ++copied) {
					out << tagBuffer[0];
					++tagBuffer;
				}
			}
			return true;
		}

419
		// Writes binary to out as a double-quoted string holding its Base64
		// encoding (non-ASCII escaping is off). Always returns true.
		bool WriteBinary(ostream& out, const Binary& binary)
		{
			const std::string encoded = EncodeBase64(binary.data(), binary.size());
			WriteDoubleQuotedString(out, encoded, false);
			return true;
		}
424
425
426
	}
}