emitterutils.cpp 12.1 KB
Newer Older
1
2
3
#include "emitterutils.h"
#include "exp.h"
#include "indentation.h"
4
#include "yaml-cpp/binary.h"
5
#include "yaml-cpp/exceptions.h"
6
#include "stringsource.h"
7
8
#include <sstream>
#include <iomanip>
9
10
11
12
13
14

namespace YAML
{
	namespace Utils
	{
		namespace {
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
			// U+FFFD, the code point substituted by the helpers below whenever a byte sequence
			// cannot be decoded or a code point is not legal to emit.
			enum {REPLACEMENT_CHARACTER = 0xFFFD};

			// Tests whether code point `ch` is an ns-anchor-char, i.e. may appear in an
			// anchor/alias name: a printable, non-whitespace character that is not a
			// flow indicator. Returns false for anything outside the Unicode range.
			bool IsAnchorChar(int ch) {
				switch (ch) {
					case ',': case '[': case ']': case '{': case '}': // c-flow-indicator
					case ' ': case '\t': // s-white
					case 0xFEFF: // c-byte-order-mark
					case 0xA: case 0xD: // b-char
						return false;
					case 0x85: // the one code point in 0x7F..0x9F that is accepted
						return true;
				}

				// control characters below space (and negative values)
				if (ch < 0x20)
					return false;

				// printable ASCII; the upper bound is inclusive so that '~' (0x7E) is accepted
				if (ch <= 0x7E)
					return true;

				// DEL and the remaining 0x80..0x9F range
				if (ch < 0xA0)
					return false;
				// surrogate halves
				if (ch >= 0xD800 && ch <= 0xDFFF)
					return false;
				// xxFFFE / xxFFFF non-characters on every plane
				if ((ch & 0xFFFE) == 0xFFFE)
					return false;
				// non-character block U+FDD0..U+FDEF
				if ((ch >= 0xFDD0) && (ch <= 0xFDEF))
					return false;
				// beyond the last Unicode code point
				if (ch > 0x10FFFF)
					return false;

				return true;
			}
			
			// Returns the length (1-4) of the UTF-8 sequence introduced by lead byte `ch`,
			// or -1 when `ch` cannot start a sequence: continuation bytes (10xxxxxx) and
			// bytes 0xF8..0xFF, which do not match the 4-byte lead pattern 11110xxx.
			int Utf8BytesIndicated(char ch) {
				const int byteVal = static_cast<unsigned char>(ch);
				if (byteVal < 0x80) // 0xxxxxxx
					return 1;
				if ((byteVal & 0xE0) == 0xC0) // 110xxxxx
					return 2;
				if ((byteVal & 0xF0) == 0xE0) // 1110xxxx
					return 3;
				if ((byteVal & 0xF8) == 0xF0) // 11110xxx
					return 4;
				return -1;
			}

			// True when `ch` has the bit pattern 10xxxxxx.
			bool IsTrailingByte(char ch) {
				const unsigned int topTwoBits = static_cast<unsigned char>(ch) >> 6;
				return topTwoBits == 0x2;
			}
			
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
			// Decodes one UTF-8 code point starting at `first` and moves `first` past the
			// bytes that were consumed. Returns false only when the range is already
			// exhausted; otherwise stores the decoded value (or REPLACEMENT_CHARACTER for
			// malformed input and illegal code points) in `codePoint` and returns true.
			bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) {
				if (first == last)
					return false;

				const int sequenceLength = Utf8BytesIndicated(*first);

				// A byte that cannot start a sequence is consumed on its own
				if (sequenceLength < 1) {
					++first;
					codePoint = REPLACEMENT_CHARACTER;
					return true;
				}

				// Single-byte sequence: the byte is the code point
				if (sequenceLength == 1) {
					codePoint = *first;
					++first;
					return true;
				}

				// Keep only the payload bits of the lead byte
				// (5 bits for a 2-byte sequence, 4 for 3 bytes, 3 for 4 bytes)
				const int leadMask = ~(0xFF << (7 - sequenceLength));
				int value = static_cast<unsigned char>(*first) & leadMask;
				++first;

				// Fold in six payload bits per trailing byte; a missing or malformed
				// trailing byte ends decoding and is left unconsumed
				int remaining = sequenceLength - 1;
				while (remaining > 0) {
					if ((first == last) || !IsTrailingByte(*first)) {
						value = REPLACEMENT_CHARACTER;
						break;
					}
					value = (value << 6) | (*first & 0x3F);
					++first;
					--remaining;
				}

				// Replace code points that are not legal
				const bool beyondUnicode = value > 0x10FFFF;
				const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
				const bool endOfPlaneNonChar = (value & 0xFFFE) == 0xFFFE;
				const bool bmpNonChar = value >= 0xFDD0 && value <= 0xFDEF;
				if (beyondUnicode || surrogate || endOfPlaneNonChar || bmpNonChar)
					value = REPLACEMENT_CHARACTER;

				codePoint = value;
				return true;
			}
			
110
			// Writes `codePoint` to `out` encoded as UTF-8. Values outside 0..0x10FFFF are
			// written as REPLACEMENT_CHARACTER instead.
			void WriteCodePoint(ostream_wrapper& out, int codePoint) {
				if (codePoint < 0 || codePoint > 0x10FFFF) {
					codePoint = REPLACEMENT_CHARACTER;
				}
				// The upper bound of each range is inclusive: U+007F, U+07FF and U+FFFF are
				// the largest values representable in 1, 2 and 3 bytes, and encoding them
				// with one byte more would produce an overlong (invalid) sequence.
				if (codePoint <= 0x7F) {
					out << static_cast<char>(codePoint);
				} else if (codePoint <= 0x7FF) {
					out << static_cast<char>(0xC0 | (codePoint >> 6))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else if (codePoint <= 0xFFFF) {
					out << static_cast<char>(0xE0 | (codePoint >> 12))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				} else {
					out << static_cast<char>(0xF0 | (codePoint >> 18))
					    << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
					    << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
					    << static_cast<char>(0x80 | (codePoint & 0x3F));
				}
			}
			
131
			bool IsValidPlainScalar(const std::string& str, FlowType::value flowType, bool allowOnlyAscii) {
132
133
134
				if(str.empty())
					return false;
				
135
				// first check the start
136
				const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow() : Exp::PlainScalar());
137
138
139
140
141
142
143
144
				if(!start.Matches(str))
					return false;
				
				// and check the end for plain whitespace (which can't be faithfully kept in a plain scalar)
				if(!str.empty() && *str.rbegin() == ' ')
					return false;

				// then check until something is disallowed
145
				const RegEx& disallowed = (flowType == FlowType::Flow ? Exp::EndScalarInFlow() : Exp::EndScalar())
146
147
148
149
150
				                          || (Exp::BlankOrBreak() + Exp::Comment())
				                          || Exp::NotPrintable()
				                          || Exp::Utf8_ByteOrderMark()
				                          || Exp::Break()
				                          || Exp::Tab();
151
152
				StringCharSource buffer(str.c_str(), str.size());
				while(buffer) {
153
154
					if(disallowed.Matches(buffer))
						return false;
155
					if(allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) 
156
						return false;
157
158
159
160
161
					++buffer;
				}
				
				return true;
			}
Jesse Beder's avatar
Jesse Beder committed
162

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
			// Reports whether `str` may be written single-quoted: it must not contain a
			// newline and, when escapeNonAscii is set, no byte >= 0x80 either.
			bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii)
			{
				// TODO: check for non-printable characters?
				for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
					const unsigned char byte = static_cast<unsigned char>(*it);
					if(byte == '\n')
						return false;
					if(escapeNonAscii && byte >= 0x80)
						return false;
				}
				return true;
			}
            
            // Reports whether `str` may be written as a literal scalar: never for
            // FlowType::Flow, and not when escapeNonAscii is set and the string holds a
            // byte >= 0x80.
            bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType, bool escapeNonAscii)
            {
                if(flowType == FlowType::Flow)
                    return false;

                // without the ASCII-only restriction nothing else is checked
                if(!escapeNonAscii)
                    return true;

                // TODO: check for non-printable characters?
                for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
                    if(static_cast<unsigned char>(*it) >= 0x80)
                        return false;
                }
                return true;
            }
			
188
            // Writes `codePoint` as a backslash escape with lower-case hex digits:
            // \xNN below 0xFF, \uNNNN below 0xFFFF, \UNNNNNNNN otherwise.
            void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
                static const char hexDigits[] = "0123456789abcdef";

                out << "\\";

                // pick the escape letter and the matching number of hex digits
                int digits;
                if(codePoint >= 0xFFFF) {
                    out << "U";
                    digits = 8;
                } else if(codePoint >= 0xFF) {
                    out << "u";
                    digits = 4;
                } else {
                    out << "x";
                    digits = 2;
                }

                // emit the nibbles, most significant first
                for(int shift = 4 * (digits - 1); shift >= 0; shift -= 4)
                    out << hexDigits[(codePoint >> shift) & 0xF];
            }

209
			// Writes `str` code point by code point as an anchor/alias name. Stops and
			// returns false at the first code point rejected by IsAnchorChar; code points
			// before it have already been written to `out`.
			bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
				std::string::const_iterator cursor = str.begin();
				const std::string::const_iterator end = str.end();

				int codePoint;
				while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
					if(IsAnchorChar(codePoint))
						WriteCodePoint(out, codePoint);
					else
						return false;
				}
				return true;
			}
222
223
		}
		
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
        // Picks the output format for `str`: the requested format when the matching
        // validator accepts the string, StringFormat::DoubleQuoted in every other case.
        StringFormat::value ComputeStringFormat(const std::string& str, EMITTER_MANIP strFormat, FlowType::value flowType, bool escapeNonAscii)
        {
            // each validator runs only for its own requested format
            if(strFormat == Auto && IsValidPlainScalar(str, flowType, escapeNonAscii))
                return StringFormat::Plain;

            if(strFormat == SingleQuoted && IsValidSingleQuotedScalar(str, escapeNonAscii))
                return StringFormat::SingleQuoted;

            if(strFormat == Literal && IsValidLiteralScalar(str, flowType, escapeNonAscii))
                return StringFormat::Literal;

            // explicit DoubleQuoted, unknown manipulators, and rejected strings
            return StringFormat::DoubleQuoted;
        }

248
		// Writes `str` wrapped in single quotes, doubling every embedded single quote.
		// Returns false as soon as a newline is met; the opening quote and the text
		// before the newline have already been written at that point.
		bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str)
		{
			out << "'";

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				switch(codePoint) {
					case '\n':
						// We can't handle a new line and the attendant indentation yet
						return false;
					case '\'':
						out << "''";
						break;
					default:
						WriteCodePoint(out, codePoint);
						break;
				}
			}

			out << "'";
			return true;
		}
		
268
		// Writes `str` wrapped in double quotes. Quote, backslash, newline, tab,
		// carriage return and backspace use their short escapes; other characters that
		// must not appear raw are written as hex escapes; everything else is written
		// as UTF-8. With escapeNonAscii set, every code point above 0x7E is escaped.
		// Always returns true.
		bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, bool escapeNonAscii)
		{
			out << "\"";
			int codePoint;
			for(std::string::const_iterator i = str.begin();
				GetNextCodePointAndAdvance(codePoint, i, str.end());
				)
			{
				switch(codePoint) {
					case '\"': out << "\\\""; break;
					case '\\': out << "\\\\"; break;
					case '\n': out << "\\n"; break;
					case '\t': out << "\\t"; break;
					case '\r': out << "\\r"; break;
					case '\b': out << "\\b"; break;
					default:
						// Control characters, DEL, the 0x80..0x9F range and non-breaking space.
						// The range starts at 0x7F because DEL is not a printable character
						// and so must not be written raw.
						if(codePoint < 0x20 || (codePoint >= 0x7F && codePoint <= 0xA0))
							WriteDoubleQuoteEscapeSequence(out, codePoint);
						else if (codePoint == 0xFEFF) // Byte order marks (ZWNS) should be escaped (YAML 1.2, sec. 5.2)
							WriteDoubleQuoteEscapeSequence(out, codePoint);
						else if (escapeNonAscii && codePoint > 0x7E)
							WriteDoubleQuoteEscapeSequence(out, codePoint);
						else
							WriteCodePoint(out, codePoint);
				}
			}
			out << "\"";
			return true;
		}

298
		// Writes `str` in literal style: a "|" header line, then the text with every
		// line (including the first) preceded by IndentTo(indent). Always returns true.
		bool WriteLiteralString(ostream_wrapper& out, const std::string& str, int indent)
		{
			out << "|\n" << IndentTo(indent);

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				if(codePoint != '\n') {
					WriteCodePoint(out, codePoint);
					continue;
				}
				// a line break in the input is followed by indentation for the next line
				out << "\n" << IndentTo(indent);
			}
			return true;
		}
		
315
		// Writes the single character `ch`. ASCII letters are written bare; every other
		// character is wrapped in double quotes, escaped where the character cannot
		// stand for itself inside a double-quoted scalar. Always returns true.
		bool WriteChar(ostream_wrapper& out, char ch)
		{
			if(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
				out << ch;
			else if(ch == '\"')
				// a raw quote would terminate the scalar early
				out << "\"\\\"\"";
			else if(ch == '\\')
				// a raw backslash would start an escape sequence
				out << "\"\\\\\"";
			else if(0x20 <= ch && ch <= 0x7e)
				out << "\"" << ch << "\"";
			else if(ch == '\t')
				out << "\"\\t\"";
			else if(ch == '\n')
				out << "\"\\n\"";
			else if(ch == '\b')
				out << "\"\\b\"";
			else {
				out << "\"";
				WriteDoubleQuoteEscapeSequence(out, ch);
				out << "\"";
			}
			return true;
		}

335
		// Writes `str` as a comment: "#" followed by Indentation(postCommentIndent) and
		// the text. Each newline in `str` starts a new comment line at the column the
		// comment began on. Always returns true.
		bool WriteComment(ostream_wrapper& out, const std::string& str, int postCommentIndent)
		{
			// column of the first "#", reused for every continuation line
			const unsigned curIndent = out.col();

			out << "#" << Indentation(postCommentIndent);
			out.set_comment();

			std::string::const_iterator cursor = str.begin();
			const std::string::const_iterator end = str.end();
			int codePoint;
			while(GetNextCodePointAndAdvance(codePoint, cursor, end)) {
				if(codePoint != '\n') {
					WriteCodePoint(out, codePoint);
					continue;
				}
				out << "\n" << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
				out.set_comment();
			}
			return true;
		}

355
		// Writes "*" followed by the name `str`; returns what WriteAliasName reports
		// for the name.
		bool WriteAlias(ostream_wrapper& out, const std::string& str)
		{
			out << "*";
			const bool nameWritten = WriteAliasName(out, str);
			return nameWritten;
		}
		
361
		// Writes "&" followed by the name `str`; returns what WriteAliasName reports
		// for the name.
		bool WriteAnchor(ostream_wrapper& out, const std::string& str)
		{
			out << "&";
			const bool nameWritten = WriteAliasName(out, str);
			return nameWritten;
		}
366

367
		// Writes `str` as a tag: "!<...>" when `verbatim` is set, "!..." otherwise.
		// The text is validated piecewise against Exp::URI() (verbatim) or Exp::Tag();
		// returns false at the first position that does not match, with the text
		// before it already written.
		bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim)
		{
			if(verbatim)
				out << "!<";
			else
				out << "!";

			const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
			StringCharSource buffer(str.c_str(), str.size());
			while(buffer) {
				const int matched = reValid.Match(buffer);
				if(matched <= 0)
					return false;

				// copy the matched characters through to the output
				for(int copied = 0; copied < matched; ++copied) {
					out << buffer[0];
					++buffer;
				}
			}

			if(verbatim)
				out << ">";
			return true;
		}
386

387
		// Writes "!" + prefix + "!" + tag. The prefix is validated piecewise against
		// Exp::URI() and the tag against Exp::Tag(); returns false at the first
		// position that does not match, with the text before it already written.
		bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix, const std::string& tag)
		{
			out << "!";
			for(StringCharSource prefixBuffer(prefix.c_str(), prefix.size()); prefixBuffer; ) {
				const int matched = Exp::URI().Match(prefixBuffer);
				if(matched <= 0)
					return false;

				// copy the matched characters through to the output
				for(int copied = 0; copied < matched; ++copied) {
					out << prefixBuffer[0];
					++prefixBuffer;
				}
			}

			out << "!";
			for(StringCharSource tagBuffer(tag.c_str(), tag.size()); tagBuffer; ) {
				const int matched = Exp::Tag().Match(tagBuffer);
				if(matched <= 0)
					return false;

				for(int copied = 0; copied < matched; ++copied) {
					out << tagBuffer[0];
					++tagBuffer;
				}
			}
			return true;
		}

417
		// Writes `binary` as a double-quoted scalar holding its Base64 encoding.
		// Returns the result of the underlying write instead of discarding it, so a
		// failure reported by WriteDoubleQuotedString reaches the caller.
		bool WriteBinary(ostream_wrapper& out, const Binary& binary)
		{
			return WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()), false);
		}
422
423
424
	}
}