1 /**
2 * Copyright (c) 2008-2012, http://www.snakeyaml.org
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.yaml.snakeyaml.scanner;
17
18 import java.nio.ByteBuffer;
19 import java.nio.charset.CharacterCodingException;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.Iterator;
23 import java.util.LinkedHashMap;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.regex.Pattern;
27
28 import org.yaml.snakeyaml.error.Mark;
29 import org.yaml.snakeyaml.error.YAMLException;
30 import org.yaml.snakeyaml.reader.StreamReader;
31 import org.yaml.snakeyaml.tokens.AliasToken;
32 import org.yaml.snakeyaml.tokens.AnchorToken;
33 import org.yaml.snakeyaml.tokens.BlockEndToken;
34 import org.yaml.snakeyaml.tokens.BlockEntryToken;
35 import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
36 import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
37 import org.yaml.snakeyaml.tokens.DirectiveToken;
38 import org.yaml.snakeyaml.tokens.DocumentEndToken;
39 import org.yaml.snakeyaml.tokens.DocumentStartToken;
40 import org.yaml.snakeyaml.tokens.FlowEntryToken;
41 import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
42 import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
43 import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
44 import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
45 import org.yaml.snakeyaml.tokens.KeyToken;
46 import org.yaml.snakeyaml.tokens.ScalarToken;
47 import org.yaml.snakeyaml.tokens.StreamEndToken;
48 import org.yaml.snakeyaml.tokens.StreamStartToken;
49 import org.yaml.snakeyaml.tokens.TagToken;
50 import org.yaml.snakeyaml.tokens.TagTuple;
51 import org.yaml.snakeyaml.tokens.Token;
52 import org.yaml.snakeyaml.tokens.ValueToken;
53 import org.yaml.snakeyaml.util.ArrayStack;
54 import org.yaml.snakeyaml.util.UriEncoder;
55
56 /**
57 * <pre>
58 * Scanner produces tokens of the following types:
59 * STREAM-START
60 * STREAM-END
61 * DIRECTIVE(name, value)
62 * DOCUMENT-START
63 * DOCUMENT-END
64 * BLOCK-SEQUENCE-START
65 * BLOCK-MAPPING-START
66 * BLOCK-END
67 * FLOW-SEQUENCE-START
68 * FLOW-MAPPING-START
69 * FLOW-SEQUENCE-END
70 * FLOW-MAPPING-END
71 * BLOCK-ENTRY
72 * FLOW-ENTRY
73 * KEY
74 * VALUE
75 * ALIAS(value)
76 * ANCHOR(value)
77 * TAG(value)
78 * SCALAR(value, plain, style)
79 * Read comments in the Scanner code for more details.
80 * </pre>
81 */
82 public final class ScannerImpl implements Scanner {
/**
 * A regular expression matching any single character that is not a
 * hexadecimal digit (0-9, A-F, a-f).
 */
private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");

/**
 * A mapping from an escape character in the input stream (for example
 * 'n' or 't') to the string that it should be replaced with. The map is a
 * plain mutable HashMap that is filled in by the static initializer of this
 * class.
 *
 * YAML defines several common and a few uncommon escape sequences.
 *
 * @see <a href="http://www.yaml.org/spec/current.html#id2517668">4.1.6.
 *      Escape Sequences</a>
 */
public final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>();

/**
 * A mapping from an escape character to the number of hexadecimal digits to
 * read ahead for that escape sequence. These escape sequences are used to
 * handle unicode escaping in the following formats, where H is a
 * hexadecimal character:
 *
 * <pre>
 * &#92;xHH         : escaped 8-bit Unicode character
 * &#92;uHHHH       : escaped 16-bit Unicode character
 * &#92;UHHHHHHHH   : escaped 32-bit Unicode character
 * </pre>
 *
 * (The backslash is written as an HTML entity above because a literal
 * backslash followed by 'u' is not legal inside Java source comments.)
 *
 * @see <a href="http://yaml.org/spec/1.1/current.html#id872840">5.6. Escape
 *      Sequences</a>
 */
public final static Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();
115
/*
 * Populates the escape tables. ESCAPE_REPLACEMENTS holds the single
 * character escapes, ESCAPE_CODES holds the escapes that are followed by a
 * fixed number of hexadecimal digits.
 */
static {
    // ASCII null
    ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
    // ASCII bell
    ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
    // ASCII backspace
    ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
    // ASCII horizontal tab
    ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
    // ASCII newline (line feed, 0x0A)
    ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
    // ASCII vertical tab
    ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
    // ASCII form-feed
    ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
    // carriage-return (0x0D)
    ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
    // ASCII escape character (Esc)
    ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
    // ASCII space
    ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
    // ASCII double-quote
    ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
    // ASCII slash. YAML 1.2 defines this escape for JSON compatibility;
    // the replacement is the slash itself.
    ESCAPE_REPLACEMENTS.put(Character.valueOf('/'), "/");
    // ASCII backslash
    ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
    // Unicode next line
    ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
    // Unicode non-breaking-space
    ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
    // Unicode line-separator
    ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
    // Unicode paragraph separator
    ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");

    // 8-bit Unicode: two hexadecimal digits follow
    ESCAPE_CODES.put(Character.valueOf('x'), 2);
    // 16-bit Unicode: four hexadecimal digits follow
    ESCAPE_CODES.put(Character.valueOf('u'), 4);
    // 32-bit Unicode: eight hexadecimal digits follow (Supplementary
    // characters are supported)
    ESCAPE_CODES.put(Character.valueOf('U'), 8);
}
// The reader that supplies the characters, the current position
// (index, line, column) and the marks used in tokens and errors.
private final StreamReader reader;
// Had we reached the end of the stream? Set once STREAM-END is queued.
private boolean done = false;

// The number of unclosed '{' and '['. flowLevel == 0 means block
// context.
private int flowLevel = 0;

// List of processed tokens that are not yet emitted.
private List<Token> tokens;

// Number of tokens that were emitted through the getToken() method.
private int tokensTaken = 0;

// The current indentation level.
private int indent = -1;

// Past indentation levels.
private ArrayStack<Integer> indents;

// Variables related to simple keys treatment. See PyYAML.

/**
 * <pre>
 * A simple key is a key that is not denoted by the '?' indicator.
 * Example of simple keys:
 *   ---
 *   block simple key: value
 *   ? not a simple key:
 *   : { flow simple key: value }
 * We emit the KEY token before all keys, so when we find a potential
 * simple key, we try to locate the corresponding ':' indicator.
 * Simple keys should be limited to a single line and 1024 characters.
 *
 * Can a simple key start at the current position? A simple key may
 * start:
 * - at the beginning of the line, not counting indentation spaces
 *       (in block context),
 * - after '{', '[', ',' (in the flow context),
 * - after '?', ':', '-' (in the block context).
 * In the block context, this flag also signifies if a block collection
 * may start at the current position.
 * </pre>
 */
private boolean allowSimpleKey = true;

/*
 * Keep track of possible simple keys. This is a dictionary. The key is
 * flowLevel; there can be no more than one possible simple key for each
 * level. The value is a SimpleKey record: (tokenNumber, required, index,
 * line, column, mark). A simple key may start with ALIAS, ANCHOR, TAG,
 * SCALAR(flow), '[', or '{' tokens.
 */
private Map<Integer, SimpleKey> possibleSimpleKeys;
211
/**
 * Creates a scanner over the given reader and immediately queues the
 * STREAM-START token.
 *
 * @param reader
 *            the source of characters to scan
 */
public ScannerImpl(StreamReader reader) {
    this.reader = reader;
    // Insertion order matters: nextPossibleSimpleKey() takes the first entry.
    this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>();
    this.indents = new ArrayStack<Integer>(10);
    this.tokens = new ArrayList<Token>(100);
    // STREAM-START is always the first token.
    fetchStreamStart();
}
220
221 /**
222 * Check whether the next token is one of the given types.
223 */
224 public boolean checkToken(Token.ID... choices) {
225 while (needMoreTokens()) {
226 fetchMoreTokens();
227 }
228 if (!this.tokens.isEmpty()) {
229 if (choices.length == 0) {
230 return true;
231 }
232 // since profiler puts this method on top (it is used a lot), we
233 // should not use 'foreach' here because of the performance reasons
234 Token.ID first = this.tokens.get(0).getTokenId();
235 for (int i = 0; i < choices.length; i++) {
236 if (first == choices[i]) {
237 return true;
238 }
239 }
240 }
241 return false;
242 }
243
244 /**
245 * Return the next token, but do not delete it from the queue.
246 */
247 public Token peekToken() {
248 while (needMoreTokens()) {
249 fetchMoreTokens();
250 }
251 return this.tokens.get(0);
252 }
253
254 /**
255 * Return the next token, removing it from the queue.
256 */
257 public Token getToken() {
258 if (!this.tokens.isEmpty()) {
259 this.tokensTaken++;
260 return this.tokens.remove(0);
261 }
262 return null;
263 }
264
265 // Private methods.
266 /**
267 * Returns true if more tokens should be scanned.
268 */
269 private boolean needMoreTokens() {
270 // If we are done, we do not require more tokens.
271 if (this.done) {
272 return false;
273 }
274 // If we aren't done, but we have no tokens, we need to scan more.
275 if (this.tokens.isEmpty()) {
276 return true;
277 }
278 // The current token may be a potential simple key, so we
279 // need to look further.
280 stalePossibleSimpleKeys();
281 return nextPossibleSimpleKey() == this.tokensTaken;
282 }
283
284 /**
285 * Fetch one or more tokens from the StreamReader.
286 */
287 private void fetchMoreTokens() {
288 // Eat whitespaces and comments until we reach the next token.
289 scanToNextToken();
290 // Remove obsolete possible simple keys.
291 stalePossibleSimpleKeys();
292 // Compare the current indentation and column. It may add some tokens
293 // and decrease the current indentation level.
294 unwindIndent(reader.getColumn());
295 // Peek the next character, to decide what the next group of tokens
296 // will look like.
297 char ch = reader.peek();
298 switch (ch) {
299 case '\0':
300 // Is it the end of stream?
301 fetchStreamEnd();
302 return;
303 case '%':
304 // Is it a directive?
305 if (checkDirective()) {
306 fetchDirective();
307 return;
308 }
309 break;
310 case '-':
311 // Is it the document start?
312 if (checkDocumentStart()) {
313 fetchDocumentStart();
314 return;
315 // Is it the block entry indicator?
316 } else if (checkBlockEntry()) {
317 fetchBlockEntry();
318 return;
319 }
320 break;
321 case '.':
322 // Is it the document end?
323 if (checkDocumentEnd()) {
324 fetchDocumentEnd();
325 return;
326 }
327 break;
328 // TODO support for BOM within a stream. (not implemented in PyYAML)
329 case '[':
330 // Is it the flow sequence start indicator?
331 fetchFlowSequenceStart();
332 return;
333 case '{':
334 // Is it the flow mapping start indicator?
335 fetchFlowMappingStart();
336 return;
337 case ']':
338 // Is it the flow sequence end indicator?
339 fetchFlowSequenceEnd();
340 return;
341 case '}':
342 // Is it the flow mapping end indicator?
343 fetchFlowMappingEnd();
344 return;
345 case ',':
346 // Is it the flow entry indicator?
347 fetchFlowEntry();
348 return;
349 // see block entry indicator above
350 case '?':
351 // Is it the key indicator?
352 if (checkKey()) {
353 fetchKey();
354 return;
355 }
356 break;
357 case ':':
358 // Is it the value indicator?
359 if (checkValue()) {
360 fetchValue();
361 return;
362 }
363 break;
364 case '*':
365 // Is it an alias?
366 fetchAlias();
367 return;
368 case '&':
369 // Is it an anchor?
370 fetchAnchor();
371 return;
372 case '!':
373 // Is it a tag?
374 fetchTag();
375 return;
376 case '|':
377 // Is it a literal scalar?
378 if (this.flowLevel == 0) {
379 fetchLiteral();
380 return;
381 }
382 break;
383 case '>':
384 // Is it a folded scalar?
385 if (this.flowLevel == 0) {
386 fetchFolded();
387 return;
388 }
389 break;
390 case '\'':
391 // Is it a single quoted scalar?
392 fetchSingle();
393 return;
394 case '"':
395 // Is it a double quoted scalar?
396 fetchDouble();
397 return;
398 }
399 // It must be a plain scalar then.
400 if (checkPlain()) {
401 fetchPlain();
402 return;
403 }
404 // No? It's an error. Let's produce a nice error message.We do this by
405 // converting escaped characters into their escape sequences. This is a
406 // backwards use of the ESCAPE_REPLACEMENTS map.
407 String chRepresentation = String.valueOf(ch);
408 for (Character s : ESCAPE_REPLACEMENTS.keySet()) {
409 String v = ESCAPE_REPLACEMENTS.get(s);
410 if (v.equals(chRepresentation)) {
411 chRepresentation = "\\" + s;// ' ' -> '\t'
412 break;
413 }
414 }
415 throw new ScannerException("while scanning for the next token", null, "found character "
416 + ch + "'" + chRepresentation + "' that cannot start any token", reader.getMark());
417 }
418
419 // Simple keys treatment.
420
421 /**
422 * Return the number of the nearest possible simple key. Actually we don't
423 * need to loop through the whole dictionary.
424 */
425 private int nextPossibleSimpleKey() {
426 /*
427 * the implementation is not as in PyYAML. Because
428 * this.possibleSimpleKeys is ordered we can simply take the first key
429 */
430 if (!this.possibleSimpleKeys.isEmpty()) {
431 return this.possibleSimpleKeys.values().iterator().next().getTokenNumber();
432 }
433 return -1;
434 }
435
436 /**
437 * <pre>
438 * Remove entries that are no longer possible simple keys. According to
439 * the YAML specification, simple keys
440 * - should be limited to a single line,
441 * - should be no longer than 1024 characters.
442 * Disabling this procedure will allow simple keys of any length and
443 * height (may cause problems if indentation is broken though).
444 * </pre>
445 */
446 private void stalePossibleSimpleKeys() {
447 if (!this.possibleSimpleKeys.isEmpty()) {
448 for (Iterator<SimpleKey> iterator = this.possibleSimpleKeys.values().iterator(); iterator
449 .hasNext();) {
450 SimpleKey key = iterator.next();
451 if ((key.getLine() != reader.getLine())
452 || (reader.getIndex() - key.getIndex() > 1024)) {
453 // If the key is not on the same line as the current
454 // position OR the difference in column between the token
455 // start and the current position is more than the maximum
456 // simple key length, then this cannot be a simple key.
457 if (key.isRequired()) {
458 // If the key was required, this implies an error
459 // condition.
460 throw new ScannerException("while scanning a simple key", key.getMark(),
461 "could not found expected ':'", reader.getMark());
462 }
463 iterator.remove();
464 }
465 }
466 }
467 }
468
469 /**
470 * The next token may start a simple key. We check if it's possible and save
471 * its position. This function is called for ALIAS, ANCHOR, TAG,
472 * SCALAR(flow), '[', and '{'.
473 */
474 private void savePossibleSimpleKey() {
475 // The next token may start a simple key. We check if it's possible
476 // and save its position. This function is called for
477 // ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
478
479 // Check if a simple key is required at the current position.
480 // A simple key is required if this position is the root flowLevel, AND
481 // the current indentation level is the same as the last indent-level.
482 boolean required = ((this.flowLevel == 0) && (this.indent == this.reader.getColumn()));
483
484 if (allowSimpleKey || !required) {
485 // A simple key is required only if it is the first token in the
486 // current line. Therefore it is always allowed.
487 } else {
488 throw new YAMLException(
489 "A simple key is required only if it is the first token in the current line");
490 }
491
492 // The next token might be a simple key. Let's save it's number and
493 // position.
494 if (this.allowSimpleKey) {
495 removePossibleSimpleKey();
496 int tokenNumber = this.tokensTaken + this.tokens.size();
497 SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(),
498 reader.getLine(), this.reader.getColumn(), this.reader.getMark());
499 this.possibleSimpleKeys.put(this.flowLevel, key);
500 }
501 }
502
503 /**
504 * Remove the saved possible key position at the current flow level.
505 */
506 private void removePossibleSimpleKey() {
507 SimpleKey key = possibleSimpleKeys.remove(flowLevel);
508 if (key != null && key.isRequired()) {
509 throw new ScannerException("while scanning a simple key", key.getMark(),
510 "could not found expected ':'", reader.getMark());
511 }
512 }
513
514 // Indentation functions.
515
516 /**
517 * * Handle implicitly ending multiple levels of block nodes by decreased
518 * indentation. This function becomes important on lines 4 and 7 of this
519 * example:
520 *
521 * <pre>
522 * 1) book one:
523 * 2) part one:
524 * 3) chapter one
525 * 4) part two:
526 * 5) chapter one
527 * 6) chapter two
528 * 7) book two:
529 * </pre>
530 *
531 * In flow context, tokens should respect indentation. Actually the
532 * condition should be `self.indent >= column` according to the spec. But
533 * this condition will prohibit intuitively correct constructions such as
534 * key : { } </pre>
535 */
536 private void unwindIndent(int col) {
537 // In the flow context, indentation is ignored. We make the scanner less
538 // restrictive then specification requires.
539 if (this.flowLevel != 0) {
540 return;
541 }
542
543 // In block context, we may need to issue the BLOCK-END tokens.
544 while (this.indent > col) {
545 Mark mark = reader.getMark();
546 this.indent = this.indents.pop();
547 this.tokens.add(new BlockEndToken(mark, mark));
548 }
549 }
550
551 /**
552 * Check if we need to increase indentation.
553 */
554 private boolean addIndent(int column) {
555 if (this.indent < column) {
556 this.indents.push(this.indent);
557 this.indent = column;
558 return true;
559 }
560 return false;
561 }
562
563 // Fetchers.
564
565 /**
566 * We always add STREAM-START as the first token and STREAM-END as the last
567 * token.
568 */
569 private void fetchStreamStart() {
570 // Read the token.
571 Mark mark = reader.getMark();
572
573 // Add STREAM-START.
574 Token token = new StreamStartToken(mark, mark);
575 this.tokens.add(token);
576 }
577
578 private void fetchStreamEnd() {
579 // Set the current intendation to -1.
580 unwindIndent(-1);
581
582 // Reset simple keys.
583 removePossibleSimpleKey();
584 this.allowSimpleKey = false;
585 this.possibleSimpleKeys.clear();
586
587 // Read the token.
588 Mark mark = reader.getMark();
589
590 // Add STREAM-END.
591 Token token = new StreamEndToken(mark, mark);
592 this.tokens.add(token);
593
594 // The stream is finished.
595 this.done = true;
596 }
597
598 /**
599 * Fetch a YAML directive. Directives are presentation details that are
600 * interpreted as instructions to the processor. YAML defines two kinds of
601 * directives, YAML and TAG; all other types are reserved for future use.
602 *
603 * @see http://www.yaml.org/spec/1.1/#id864824
604 */
605 private void fetchDirective() {
606 // Set the current intendation to -1.
607 unwindIndent(-1);
608
609 // Reset simple keys.
610 removePossibleSimpleKey();
611 this.allowSimpleKey = false;
612
613 // Scan and add DIRECTIVE.
614 Token tok = scanDirective();
615 this.tokens.add(tok);
616 }
617
/**
 * Fetch a document-start token ("---"). Delegates to
 * fetchDocumentIndicator with isDocumentStart set to true.
 */
private void fetchDocumentStart() {
    fetchDocumentIndicator(true);
}
624
/**
 * Fetch a document-end token ("..."). Delegates to fetchDocumentIndicator
 * with isDocumentStart set to false.
 */
private void fetchDocumentEnd() {
    fetchDocumentIndicator(false);
}
631
632 /**
633 * Fetch a document indicator, either "---" for "document-start", or else
634 * "..." for "document-end. The type is chosen by the given boolean.
635 */
636 private void fetchDocumentIndicator(boolean isDocumentStart) {
637 // Set the current intendation to -1.
638 unwindIndent(-1);
639
640 // Reset simple keys. Note that there could not be a block collection
641 // after '---'.
642 removePossibleSimpleKey();
643 this.allowSimpleKey = false;
644
645 // Add DOCUMENT-START or DOCUMENT-END.
646 Mark startMark = reader.getMark();
647 reader.forward(3);
648 Mark endMark = reader.getMark();
649 Token token;
650 if (isDocumentStart) {
651 token = new DocumentStartToken(startMark, endMark);
652 } else {
653 token = new DocumentEndToken(startMark, endMark);
654 }
655 this.tokens.add(token);
656 }
657
/**
 * Fetch a flow sequence start ('['). Delegates to fetchFlowCollectionStart
 * with isMappingStart set to false.
 */
private void fetchFlowSequenceStart() {
    fetchFlowCollectionStart(false);
}
661
/**
 * Fetch a flow mapping start ('{'). Delegates to fetchFlowCollectionStart
 * with isMappingStart set to true.
 */
private void fetchFlowMappingStart() {
    fetchFlowCollectionStart(true);
}
665
666 /**
667 * Fetch a flow-style collection start, which is either a sequence or a
668 * mapping. The type is determined by the given boolean.
669 *
670 * A flow-style collection is in a format similar to JSON. Sequences are
671 * started by '[' and ended by ']'; mappings are started by '{' and ended by
672 * '}'.
673 *
674 * @see http://www.yaml.org/spec/1.1/#id863975
675 *
676 * @param isMappingStart
677 */
678 private void fetchFlowCollectionStart(boolean isMappingStart) {
679 // '[' and '{' may start a simple key.
680 savePossibleSimpleKey();
681
682 // Increase the flow level.
683 this.flowLevel++;
684
685 // Simple keys are allowed after '[' and '{'.
686 this.allowSimpleKey = true;
687
688 // Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
689 Mark startMark = reader.getMark();
690 reader.forward(1);
691 Mark endMark = reader.getMark();
692 Token token;
693 if (isMappingStart) {
694 token = new FlowMappingStartToken(startMark, endMark);
695 } else {
696 token = new FlowSequenceStartToken(startMark, endMark);
697 }
698 this.tokens.add(token);
699 }
700
/**
 * Fetch a flow sequence end (']'). Delegates to fetchFlowCollectionEnd with
 * isMappingEnd set to false.
 */
private void fetchFlowSequenceEnd() {
    fetchFlowCollectionEnd(false);
}
704
/**
 * Fetch a flow mapping end ('}'). Delegates to fetchFlowCollectionEnd with
 * isMappingEnd set to true.
 */
private void fetchFlowMappingEnd() {
    fetchFlowCollectionEnd(true);
}
708
709 /**
710 * Fetch a flow-style collection end, which is either a sequence or a
711 * mapping. The type is determined by the given boolean.
712 *
713 * A flow-style collection is in a format similar to JSON. Sequences are
714 * started by '[' and ended by ']'; mappings are started by '{' and ended by
715 * '}'.
716 *
717 * @see http://www.yaml.org/spec/1.1/#id863975
718 */
719 private void fetchFlowCollectionEnd(boolean isMappingEnd) {
720 // Reset possible simple key on the current level.
721 removePossibleSimpleKey();
722
723 // Decrease the flow level.
724 this.flowLevel--;
725
726 // No simple keys after ']' or '}'.
727 this.allowSimpleKey = false;
728
729 // Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
730 Mark startMark = reader.getMark();
731 reader.forward();
732 Mark endMark = reader.getMark();
733 Token token;
734 if (isMappingEnd) {
735 token = new FlowMappingEndToken(startMark, endMark);
736 } else {
737 token = new FlowSequenceEndToken(startMark, endMark);
738 }
739 this.tokens.add(token);
740 }
741
742 /**
743 * Fetch an entry in the flow style. Flow-style entries occur either
744 * immediately after the start of a collection, or else after a comma.
745 *
746 * @see http://www.yaml.org/spec/1.1/#id863975
747 */
748 private void fetchFlowEntry() {
749 // Simple keys are allowed after ','.
750 this.allowSimpleKey = true;
751
752 // Reset possible simple key on the current level.
753 removePossibleSimpleKey();
754
755 // Add FLOW-ENTRY.
756 Mark startMark = reader.getMark();
757 reader.forward();
758 Mark endMark = reader.getMark();
759 Token token = new FlowEntryToken(startMark, endMark);
760 this.tokens.add(token);
761 }
762
763 /**
764 * Fetch an entry in the block style.
765 *
766 * @see http://www.yaml.org/spec/1.1/#id863975
767 */
768 private void fetchBlockEntry() {
769 // Block context needs additional checks.
770 if (this.flowLevel == 0) {
771 // Are we allowed to start a new entry?
772 if (!this.allowSimpleKey) {
773 throw new ScannerException(null, null, "sequence entries are not allowed here",
774 reader.getMark());
775 }
776
777 // We may need to add BLOCK-SEQUENCE-START.
778 if (addIndent(this.reader.getColumn())) {
779 Mark mark = reader.getMark();
780 this.tokens.add(new BlockSequenceStartToken(mark, mark));
781 }
782 } else {
783 // It's an error for the block entry to occur in the flow
784 // context,but we let the parser detect this.
785 }
786 // Simple keys are allowed after '-'.
787 this.allowSimpleKey = true;
788
789 // Reset possible simple key on the current level.
790 removePossibleSimpleKey();
791
792 // Add BLOCK-ENTRY.
793 Mark startMark = reader.getMark();
794 reader.forward();
795 Mark endMark = reader.getMark();
796 Token token = new BlockEntryToken(startMark, endMark);
797 this.tokens.add(token);
798 }
799
800 /**
801 * Fetch a key in a block-style mapping.
802 *
803 * @see http://www.yaml.org/spec/1.1/#id863975
804 */
805 private void fetchKey() {
806 // Block context needs additional checks.
807 if (this.flowLevel == 0) {
808 // Are we allowed to start a key (not necessary a simple)?
809 if (!this.allowSimpleKey) {
810 throw new ScannerException(null, null, "mapping keys are not allowed here",
811 reader.getMark());
812 }
813 // We may need to add BLOCK-MAPPING-START.
814 if (addIndent(this.reader.getColumn())) {
815 Mark mark = reader.getMark();
816 this.tokens.add(new BlockMappingStartToken(mark, mark));
817 }
818 }
819 // Simple keys are allowed after '?' in the block context.
820 this.allowSimpleKey = this.flowLevel == 0;
821
822 // Reset possible simple key on the current level.
823 removePossibleSimpleKey();
824
825 // Add KEY.
826 Mark startMark = reader.getMark();
827 reader.forward();
828 Mark endMark = reader.getMark();
829 Token token = new KeyToken(startMark, endMark);
830 this.tokens.add(token);
831 }
832
833 /**
834 * Fetch a value in a block-style mapping.
835 *
836 * @see http://www.yaml.org/spec/1.1/#id863975
837 */
838 private void fetchValue() {
839 // Do we determine a simple key?
840 SimpleKey key = this.possibleSimpleKeys.remove(this.flowLevel);
841 if (key != null) {
842 // Add KEY.
843 this.tokens.add(key.getTokenNumber() - this.tokensTaken, new KeyToken(key.getMark(),
844 key.getMark()));
845
846 // If this key starts a new block mapping, we need to add
847 // BLOCK-MAPPING-START.
848 if (this.flowLevel == 0) {
849 if (addIndent(key.getColumn())) {
850 this.tokens.add(key.getTokenNumber() - this.tokensTaken,
851 new BlockMappingStartToken(key.getMark(), key.getMark()));
852 }
853 }
854 // There cannot be two simple keys one after another.
855 this.allowSimpleKey = false;
856
857 } else {// It must be a part of a complex key.
858 // Block context needs additional checks.Do we really need them?
859 // They
860 // will be catched by the parser anyway.)
861 if (this.flowLevel == 0) {
862
863 // We are allowed to start a complex value if and only if we can
864 // start a simple key.
865 if (!this.allowSimpleKey) {
866 throw new ScannerException(null, null, "mapping values are not allowed here",
867 reader.getMark());
868 }
869 }
870
871 // If this value starts a new block mapping, we need to add
872 // BLOCK-MAPPING-START. It will be detected as an error later by
873 // the parser.
874 if (flowLevel == 0) {
875 if (addIndent(reader.getColumn())) {
876 Mark mark = reader.getMark();
877 this.tokens.add(new BlockMappingStartToken(mark, mark));
878 }
879 }
880
881 // Simple keys are allowed after ':' in the block context.
882 allowSimpleKey = (flowLevel == 0);
883
884 // Reset possible simple key on the current level.
885 removePossibleSimpleKey();
886 }
887 // Add VALUE.
888 Mark startMark = reader.getMark();
889 reader.forward();
890 Mark endMark = reader.getMark();
891 Token token = new ValueToken(startMark, endMark);
892 this.tokens.add(token);
893 }
894
895 /**
896 * Fetch an alias, which is a reference to an anchor. Aliases take the
897 * format:
898 *
899 * <pre>
900 * *(anchor name)
901 * </pre>
902 *
903 * @see http://www.yaml.org/spec/1.1/#id863390
904 */
905 private void fetchAlias() {
906 // ALIAS could be a simple key.
907 savePossibleSimpleKey();
908
909 // No simple keys after ALIAS.
910 this.allowSimpleKey = false;
911
912 // Scan and add ALIAS.
913 Token tok = scanAnchor(false);
914 this.tokens.add(tok);
915 }
916
917 /**
918 * Fetch an anchor. Anchors take the form:
919 *
920 * <pre>
921 * &(anchor name)
922 * </pre>
923 *
924 * @see http://www.yaml.org/spec/1.1/#id863390
925 */
926 private void fetchAnchor() {
927 // ANCHOR could start a simple key.
928 savePossibleSimpleKey();
929
930 // No simple keys after ANCHOR.
931 this.allowSimpleKey = false;
932
933 // Scan and add ANCHOR.
934 Token tok = scanAnchor(true);
935 this.tokens.add(tok);
936 }
937
938 /**
939 * Fetch a tag. Tags take a complex form.
940 *
941 * @see http://www.yaml.org/spec/1.1/#id861700
942 */
943 private void fetchTag() {
944 // TAG could start a simple key.
945 savePossibleSimpleKey();
946
947 // No simple keys after TAG.
948 this.allowSimpleKey = false;
949
950 // Scan and add TAG.
951 Token tok = scanTag();
952 this.tokens.add(tok);
953 }
954
/**
 * Fetch a literal scalar, denoted with a vertical-bar. This is the type
 * best used for source code and other content, such as binary data, which
 * must be included verbatim. Delegates to fetchBlockScalar with the style
 * character '|'.
 *
 * @see <a href="http://www.yaml.org/spec/1.1/#id863975">YAML 1.1
 *      specification</a>
 */
private void fetchLiteral() {
    fetchBlockScalar('|');
}
965
/**
 * Fetch a folded scalar, denoted with a greater-than sign. This is the type
 * best used for long content, such as the text of a chapter or description.
 * Delegates to fetchBlockScalar with the style character '>'.
 *
 * @see <a href="http://www.yaml.org/spec/1.1/#id863975">YAML 1.1
 *      specification</a>
 */
private void fetchFolded() {
    fetchBlockScalar('>');
}
975
976 /**
977 * Fetch a block scalar (literal or folded).
978 *
979 * @see http://www.yaml.org/spec/1.1/#id863975
980 *
981 * @param style
982 */
983 private void fetchBlockScalar(char style) {
984 // A simple key may follow a block scalar.
985 this.allowSimpleKey = true;
986
987 // Reset possible simple key on the current level.
988 removePossibleSimpleKey();
989
990 // Scan and add SCALAR.
991 Token tok = scanBlockScalar(style);
992 this.tokens.add(tok);
993 }
994
/**
 * Fetch a single-quoted (') scalar. Delegates to fetchFlowScalar with the
 * single quote as style character.
 */
private void fetchSingle() {
    fetchFlowScalar('\'');
}
1001
/**
 * Fetch a double-quoted (") scalar. Delegates to fetchFlowScalar with the
 * double quote as style character.
 */
private void fetchDouble() {
    fetchFlowScalar('"');
}
1008
1009 /**
1010 * Fetch a flow scalar (single- or double-quoted).
1011 *
1012 * @see http://www.yaml.org/spec/1.1/#id863975
1013 *
1014 * @param style
1015 */
1016 private void fetchFlowScalar(char style) {
1017 // A flow scalar could be a simple key.
1018 savePossibleSimpleKey();
1019
1020 // No simple keys after flow scalars.
1021 this.allowSimpleKey = false;
1022
1023 // Scan and add SCALAR.
1024 Token tok = scanFlowScalar(style);
1025 this.tokens.add(tok);
1026 }
1027
1028 /**
1029 * Fetch a plain scalar.
1030 */
1031 private void fetchPlain() {
1032 // A plain scalar could be a simple key.
1033 savePossibleSimpleKey();
1034
1035 // No simple keys after plain scalars. But note that `scan_plain` will
1036 // change this flag if the scan is finished at the beginning of the
1037 // line.
1038 this.allowSimpleKey = false;
1039
1040 // Scan and add SCALAR. May change `allow_simple_key`.
1041 Token tok = scanPlain();
1042 this.tokens.add(tok);
1043 }
1044
1045 // Checkers.
/**
 * Returns true if the next thing on the reader is a directive, given that
 * the leading '%' has already been checked. The only remaining condition is
 * that the reader is at column 0.
 *
 * @see <a href="http://www.yaml.org/spec/1.1/#id864824">YAML 1.1
 *      specification</a>
 */
private boolean checkDirective() {
    // DIRECTIVE: ^ '%' ...
    // The '%' indicator is already checked.
    return reader.getColumn() == 0;
}
1057
1058 /**
1059 * Returns true if the next thing on the reader is a document-start ("---").
1060 * A document-start is always followed immediately by a new line.
1061 */
1062 private boolean checkDocumentStart() {
1063 // DOCUMENT-START: ^ '---' (' '|'\n')
1064 if (reader.getColumn() == 0) {
1065 if ("---".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1066 return true;
1067 }
1068 }
1069 return false;
1070 }
1071
1072 /**
1073 * Returns true if the next thing on the reader is a document-end ("..."). A
1074 * document-end is always followed immediately by a new line.
1075 */
1076 private boolean checkDocumentEnd() {
1077 // DOCUMENT-END: ^ '...' (' '|'\n')
1078 if (reader.getColumn() == 0) {
1079 if ("...".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1080 return true;
1081 }
1082 }
1083 return false;
1084 }
1085
1086 /**
1087 * Returns true if the next thing on the reader is a block token.
1088 */
1089 private boolean checkBlockEntry() {
1090 // BLOCK-ENTRY: '-' (' '|'\n')
1091 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1092 }
1093
1094 /**
1095 * Returns true if the next thing on the reader is a key token.
1096 */
1097 private boolean checkKey() {
1098 // KEY(flow context): '?'
1099 if (this.flowLevel != 0) {
1100 return true;
1101 } else {
1102 // KEY(block context): '?' (' '|'\n')
1103 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1104 }
1105 }
1106
1107 /**
1108 * Returns true if the next thing on the reader is a value token.
1109 */
1110 private boolean checkValue() {
1111 // VALUE(flow context): ':'
1112 if (flowLevel != 0) {
1113 return true;
1114 } else {
1115 // VALUE(block context): ':' (' '|'\n')
1116 return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1117 }
1118 }
1119
1120 /**
1121 * Returns true if the next thing on the reader is a plain token.
1122 */
1123 private boolean checkPlain() {
1124 /**
1125 * <pre>
1126 * A plain scalar may start with any non-space character except:
1127 * '-', '?', ':', ',', '[', ']', '{', '}',
1128 * '#', '&', '*', '!', '|', '>', '\'', '\"',
1129 * '%', '@', '`'.
1130 *
1131 * It may also start with
1132 * '-', '?', ':'
1133 * if it is followed by a non-space character.
1134 *
1135 * Note that we limit the last rule to the block context (except the
1136 * '-' character) because we want the flow context to be space
1137 * independent.
1138 * </pre>
1139 */
1140 char ch = reader.peek();
1141 // If the next char is NOT one of the forbidden chars above or
1142 // whitespace, then this is the start of a plain scalar.
1143 return Constant.NULL_BL_T_LINEBR.hasNo(ch, "-?:,[]{}#&*!|>\'\"%@`")
1144 || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (ch == '-' || (this.flowLevel == 0 && "?:"
1145 .indexOf(ch) != -1)));
1146 }
1147
1148 // Scanners.
1149
1150 /**
1151 * <pre>
1152 * We ignore spaces, line breaks and comments.
1153 * If we find a line break in the block context, we set the flag
1154 * `allow_simple_key` on.
1155 * The byte order mark is stripped if it's the first character in the
1156 * stream. We do not yet support BOM inside the stream as the
1157 * specification requires. Any such mark will be considered as a part
1158 * of the document.
1159 * TODO: We need to make tab handling rules more sane. A good rule is
1160 * Tabs cannot precede tokens
1161 * BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
1162 * KEY(block), VALUE(block), BLOCK-ENTRY
1163 * So the checking code is
1164 * if <TAB>:
1165 * self.allow_simple_keys = False
1166 * We also need to add the check for `allow_simple_keys == True` to
1167 * `unwind_indent` before issuing BLOCK-END.
1168 * Scanners for block, flow, and plain scalars need to be modified.
1169 * </pre>
1170 */
1171 private void scanToNextToken() {
1172 // If there is a byte order mark (BOM) at the beginning of the stream,
1173 // forward past it.
1174 if (reader.getIndex() == 0 && reader.peek() == '\uFEFF') {
1175 reader.forward();
1176 }
1177 boolean found = false;
1178 while (!found) {
1179 int ff = 0;
1180 // Peek ahead until we find the first non-space character, then
1181 // move forward directly to that character.
1182 while (reader.peek(ff) == ' ') {
1183 ff++;
1184 }
1185 if (ff > 0) {
1186 reader.forward(ff);
1187 }
1188 // If the character we have skipped forward to is a comment (#),
1189 // then peek ahead until we find the next end of line. YAML
1190 // comments are from a # to the next new-line. We then forward
1191 // past the comment.
1192 if (reader.peek() == '#') {
1193 ff = 0;
1194 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1195 ff++;
1196 }
1197 if (ff > 0) {
1198 reader.forward(ff);
1199 }
1200 }
1201 // If we scanned a line break, then (depending on flow level),
1202 // simple keys may be allowed.
1203 if (scanLineBreak().length() != 0) {// found a line-break
1204 if (this.flowLevel == 0) {
1205 // Simple keys are allowed at flow-level 0 after a line
1206 // break
1207 this.allowSimpleKey = true;
1208 }
1209 } else {
1210 found = true;
1211 }
1212 }
1213 }
1214
1215 @SuppressWarnings({ "unchecked", "rawtypes" })
1216 private Token scanDirective() {
1217 // See the specification for details.
1218 Mark startMark = reader.getMark();
1219 Mark endMark;
1220 reader.forward();
1221 String name = scanDirectiveName(startMark);
1222 List<?> value = null;
1223 if ("YAML".equals(name)) {
1224 value = scanYamlDirectiveValue(startMark);
1225 endMark = reader.getMark();
1226 } else if ("TAG".equals(name)) {
1227 value = scanTagDirectiveValue(startMark);
1228 endMark = reader.getMark();
1229 } else {
1230 endMark = reader.getMark();
1231 int ff = 0;
1232 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1233 ff++;
1234 }
1235 if (ff > 0) {
1236 reader.forward(ff);
1237 }
1238 }
1239 scanDirectiveIgnoredLine(startMark);
1240 return new DirectiveToken(name, value, startMark, endMark);
1241 }
1242
1243 /**
1244 * Scan a directive name. Directive names are a series of non-space
1245 * characters.
1246 *
1247 * @see http://www.yaml.org/spec/1.1/#id895217
1248 */
1249 private String scanDirectiveName(Mark startMark) {
1250 // See the specification for details.
1251 int length = 0;
1252 // A Directive-name is a sequence of alphanumeric characters
1253 // (a-z,A-Z,0-9). We scan until we find something that isn't.
1254 // FIXME this disagrees with the specification.
1255 char ch = reader.peek(length);
1256 while (Constant.ALPHA.has(ch)) {
1257 length++;
1258 ch = reader.peek(length);
1259 }
1260 // If the name would be empty, an error occurs.
1261 if (length == 0) {
1262 throw new ScannerException("while scanning a directive", startMark,
1263 "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1264 + ")", reader.getMark());
1265 }
1266 String value = reader.prefixForward(length);
1267 ch = reader.peek();
1268 if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1269 throw new ScannerException("while scanning a directive", startMark,
1270 "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1271 + ")", reader.getMark());
1272 }
1273 return value;
1274 }
1275
1276 private List<Integer> scanYamlDirectiveValue(Mark startMark) {
1277 // See the specification for details.
1278 while (reader.peek() == ' ') {
1279 reader.forward();
1280 }
1281 Integer major = scanYamlDirectiveNumber(startMark);
1282 if (reader.peek() != '.') {
1283 throw new ScannerException("while scanning a directive", startMark,
1284 "expected a digit or '.', but found " + reader.peek() + "("
1285 + ((int) reader.peek()) + ")", reader.getMark());
1286 }
1287 reader.forward();
1288 Integer minor = scanYamlDirectiveNumber(startMark);
1289 if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1290 throw new ScannerException("while scanning a directive", startMark,
1291 "expected a digit or ' ', but found " + reader.peek() + "("
1292 + ((int) reader.peek()) + ")", reader.getMark());
1293 }
1294 List<Integer> result = new ArrayList<Integer>(2);
1295 result.add(major);
1296 result.add(minor);
1297 return result;
1298 }
1299
1300 /**
1301 * Read a %YAML directive number: this is either the major or the minor
1302 * part. Stop reading at a non-digit character (usually either '.' or '\n').
1303 *
1304 * @see http://www.yaml.org/spec/1.1/#id895631
1305 * @see http://www.yaml.org/spec/1.1/#ns-dec-digit
1306 */
1307 private Integer scanYamlDirectiveNumber(Mark startMark) {
1308 // See the specification for details.
1309 char ch = reader.peek();
1310 if (!Character.isDigit(ch)) {
1311 throw new ScannerException("while scanning a directive", startMark,
1312 "expected a digit, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
1313 }
1314 int length = 0;
1315 while (Character.isDigit(reader.peek(length))) {
1316 length++;
1317 }
1318 Integer value = Integer.parseInt(reader.prefixForward(length));
1319 return value;
1320 }
1321
1322 /**
1323 * <p>
1324 * Read a %TAG directive value:
1325 *
1326 * <pre>
1327 * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments
1328 * </pre>
1329 *
1330 * </p>
1331 *
1332 * @see http://www.yaml.org/spec/1.1/#id896044
1333 */
1334 private List<String> scanTagDirectiveValue(Mark startMark) {
1335 // See the specification for details.
1336 while (reader.peek() == ' ') {
1337 reader.forward();
1338 }
1339 String handle = scanTagDirectiveHandle(startMark);
1340 while (reader.peek() == ' ') {
1341 reader.forward();
1342 }
1343 String prefix = scanTagDirectivePrefix(startMark);
1344 List<String> result = new ArrayList<String>(2);
1345 result.add(handle);
1346 result.add(prefix);
1347 return result;
1348 }
1349
1350 /**
1351 * Scan a %TAG directive's handle. This is YAML's c-tag-handle.
1352 *
1353 * @see http://www.yaml.org/spec/1.1/#id896876
1354 * @param startMark
1355 * @return
1356 */
1357 private String scanTagDirectiveHandle(Mark startMark) {
1358 // See the specification for details.
1359 String value = scanTagHandle("directive", startMark);
1360 char ch = reader.peek();
1361 if (ch != ' ') {
1362 throw new ScannerException("while scanning a directive", startMark,
1363 "expected ' ', but found " + reader.peek() + "(" + ch + ")", reader.getMark());
1364 }
1365 return value;
1366 }
1367
1368 /**
1369 * Scan a %TAG directive's prefix. This is YAML's ns-tag-prefix.
1370 *
1371 * @see http://www.yaml.org/spec/1.1/#ns-tag-prefix
1372 */
1373 private String scanTagDirectivePrefix(Mark startMark) {
1374 // See the specification for details.
1375 String value = scanTagUri("directive", startMark);
1376 if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1377 throw new ScannerException("while scanning a directive", startMark,
1378 "expected ' ', but found " + reader.peek() + "(" + ((int) reader.peek()) + ")",
1379 reader.getMark());
1380 }
1381 return value;
1382 }
1383
1384 private String scanDirectiveIgnoredLine(Mark startMark) {
1385 // See the specification for details.
1386 int ff = 0;
1387 while (reader.peek(ff) == ' ') {
1388 ff++;
1389 }
1390 if (ff > 0) {
1391 reader.forward(ff);
1392 }
1393 if (reader.peek() == '#') {
1394 ff = 0;
1395 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1396 ff++;
1397 }
1398 reader.forward(ff);
1399 }
1400 char ch = reader.peek();
1401 String lineBreak = scanLineBreak();
1402 if (lineBreak.length() == 0 && ch != '\0') {
1403 throw new ScannerException("while scanning a directive", startMark,
1404 "expected a comment or a line break, but found " + ch + "(" + ((int) ch) + ")",
1405 reader.getMark());
1406 }
1407 return lineBreak;
1408 }
1409
1410 /**
1411 * <pre>
1412 * The specification does not restrict characters for anchors and
1413 * aliases. This may lead to problems, for instance, the document:
1414 * [ *alias, value ]
1415 * can be interpreted in two ways, as
1416 * [ "value" ]
1417 * and
1418 * [ *alias , "value" ]
1419 * Therefore we restrict aliases to numbers and ASCII letters.
1420 * </pre>
1421 */
1422 private Token scanAnchor(boolean isAnchor) {
1423 Mark startMark = reader.getMark();
1424 char indicator = reader.peek();
1425 String name = indicator == '*' ? "alias" : "anchor";
1426 reader.forward();
1427 int length = 0;
1428 char ch = reader.peek(length);
1429 while (Constant.ALPHA.has(ch)) {
1430 length++;
1431 ch = reader.peek(length);
1432 }
1433 if (length == 0) {
1434 throw new ScannerException("while scanning an " + name, startMark,
1435 "expected alphabetic or numeric character, but found but found " + ch,
1436 reader.getMark());
1437 }
1438 String value = reader.prefixForward(length);
1439 ch = reader.peek();
1440 if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) {
1441 throw new ScannerException("while scanning an " + name, startMark,
1442 "expected alphabetic or numeric character, but found " + ch + "("
1443 + ((int) reader.peek()) + ")", reader.getMark());
1444 }
1445 Mark endMark = reader.getMark();
1446 Token tok;
1447 if (isAnchor) {
1448 tok = new AnchorToken(value, startMark, endMark);
1449 } else {
1450 tok = new AliasToken(value, startMark, endMark);
1451 }
1452 return tok;
1453 }
1454
1455 /**
1456 * <p>
1457 * Scan a Tag property. A Tag property may be specified in one of three
1458 * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag
1459 * </p>
1460 *
1461 * <p>
1462 * c-verbatim-tag takes the form !<ns-uri-char+> and must be delivered
1463 * verbatim (as-is) to the application. In particular, verbatim tags are not
1464 * subject to tag resolution.
1465 * </p>
1466 *
1467 * <p>
1468 * c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix.
1469 * If the tag handle is a c-primary-tag-handle ('!') then the suffix must
1470 * have all exclamation marks properly URI-escaped (%21); otherwise, the
1471 * string will look like a named tag handle: !foo!bar would be interpreted
1472 * as (handle="!foo!", suffix="bar").
1473 * </p>
1474 *
1475 * <p>
1476 * c-ns-non-specific-tag is always a lone '!'; this is only useful for plain
1477 * scalars, where its specification means that the scalar MUST be resolved
1478 * to have type tag:yaml.org,2002:str.
1479 * </p>
1480 *
1481 * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now.
1482 *
1483 * @see http://www.yaml.org/spec/1.1/#id900262
1484 *
1485 * TODO Note that this method does not enforce rules about local versus
1486 * global tags!
1487 */
1488 private Token scanTag() {
1489 // See the specification for details.
1490 Mark startMark = reader.getMark();
1491 // Determine the type of tag property based on the first character
1492 // encountered
1493 char ch = reader.peek(1);
1494 String handle = null;
1495 String suffix = null;
1496 // Verbatim tag! (c-verbatim-tag)
1497 if (ch == '<') {
1498 // Skip the exclamation mark and >, then read the tag suffix (as
1499 // a URI).
1500 reader.forward(2);
1501 suffix = scanTagUri("tag", startMark);
1502 if (reader.peek() != '>') {
1503 // If there are any characters between the end of the tag-suffix
1504 // URI and the closing >, then an error has occurred.
1505 throw new ScannerException("while scanning a tag", startMark,
1506 "expected '>', but found '" + reader.peek() + "' (" + ((int) reader.peek())
1507 + ")", reader.getMark());
1508 }
1509 reader.forward();
1510 } else if (Constant.NULL_BL_T_LINEBR.has(ch)) {
1511 // A NUL, blank, tab, or line-break means that this was a
1512 // c-ns-non-specific tag.
1513 suffix = "!";
1514 reader.forward();
1515 } else {
1516 // Any other character implies c-ns-shorthand-tag type.
1517
1518 // Look ahead in the stream to determine whether this tag property
1519 // is of the form !foo or !foo!bar.
1520 int length = 1;
1521 boolean useHandle = false;
1522 while (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1523 if (ch == '!') {
1524 useHandle = true;
1525 break;
1526 }
1527 length++;
1528 ch = reader.peek(length);
1529 }
1530 handle = "!";
1531 // If we need to use a handle, scan it in; otherwise, the handle is
1532 // presumed to be '!'.
1533 if (useHandle) {
1534 handle = scanTagHandle("tag", startMark);
1535 } else {
1536 handle = "!";
1537 reader.forward();
1538 }
1539 suffix = scanTagUri("tag", startMark);
1540 }
1541 ch = reader.peek();
1542 // Check that the next character is allowed to follow a tag-property;
1543 // if it is not, raise the error.
1544 if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1545 throw new ScannerException("while scanning a tag", startMark,
1546 "expected ' ', but found '" + ch + "' (" + ((int) ch) + ")", reader.getMark());
1547 }
1548 TagTuple value = new TagTuple(handle, suffix);
1549 Mark endMark = reader.getMark();
1550 return new TagToken(value, startMark, endMark);
1551 }
1552
1553 private Token scanBlockScalar(char style) {
1554 // See the specification for details.
1555 boolean folded;
1556 // Depending on the given style, we determine whether the scalar is
1557 // folded ('>') or literal ('|')
1558 if (style == '>') {
1559 folded = true;
1560 } else {
1561 folded = false;
1562 }
1563 StringBuilder chunks = new StringBuilder();
1564 Mark startMark = reader.getMark();
1565 // Scan the header.
1566 reader.forward();
1567 Chomping chompi = scanBlockScalarIndicators(startMark);
1568 int increment = chompi.getIncrement();
1569 scanBlockScalarIgnoredLine(startMark);
1570
1571 // Determine the indentation level and go to the first non-empty line.
1572 int minIndent = this.indent + 1;
1573 if (minIndent < 1) {
1574 minIndent = 1;
1575 }
1576 String breaks = null;
1577 int maxIndent = 0;
1578 int indent = 0;
1579 Mark endMark;
1580 if (increment == -1) {
1581 Object[] brme = scanBlockScalarIndentation();
1582 breaks = (String) brme[0];
1583 maxIndent = ((Integer) brme[1]).intValue();
1584 endMark = (Mark) brme[2];
1585 indent = Math.max(minIndent, maxIndent);
1586 } else {
1587 indent = minIndent + increment - 1;
1588 Object[] brme = scanBlockScalarBreaks(indent);
1589 breaks = (String) brme[0];
1590 endMark = (Mark) brme[1];
1591 }
1592
1593 String lineBreak = "";
1594
1595 // Scan the inner part of the block scalar.
1596 while (this.reader.getColumn() == indent && reader.peek() != '\0') {
1597 chunks.append(breaks);
1598 boolean leadingNonSpace = " \t".indexOf(reader.peek()) == -1;
1599 int length = 0;
1600 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) {
1601 length++;
1602 }
1603 chunks.append(reader.prefixForward(length));
1604 lineBreak = scanLineBreak();
1605 Object[] brme = scanBlockScalarBreaks(indent);
1606 breaks = (String) brme[0];
1607 endMark = (Mark) brme[1];
1608 if (this.reader.getColumn() == indent && reader.peek() != '\0') {
1609
1610 // Unfortunately, folding rules are ambiguous.
1611 //
1612 // This is the folding according to the specification:
1613 if (folded && "\n".equals(lineBreak) && leadingNonSpace
1614 && " \t".indexOf(reader.peek()) == -1) {
1615 if (breaks.length() == 0) {
1616 chunks.append(" ");
1617 }
1618 } else {
1619 chunks.append(lineBreak);
1620 }
1621 // Clark Evans's interpretation (also in the spec examples) not
1622 // imported from PyYAML
1623 } else {
1624 break;
1625 }
1626 }
1627 // Chomp the tail.
1628 if (chompi.chompTailIsNotFalse()) {
1629 chunks.append(lineBreak);
1630 }
1631 if (chompi.chompTailIsTrue()) {
1632 chunks.append(breaks);
1633 }
1634 // We are done.
1635 return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1636 }
1637
1638 /**
1639 * Scan a block scalar indicator. The block scalar indicator includes two
1640 * optional components, which may appear in either order.
1641 *
1642 * A block indentation indicator is a non-zero digit describing the
1643 * indentation level of the block scalar to follow. This indentation is an
1644 * additional number of spaces relative to the current indentation level.
1645 *
1646 * A block chomping indicator is a + or -, selecting the chomping mode away
1647 * from the default (clip) to either -(strip) or +(keep).
1648 *
1649 * @see http://www.yaml.org/spec/1.1/#id868988
1650 * @see http://www.yaml.org/spec/1.1/#id927035
1651 * @see http://www.yaml.org/spec/1.1/#id927557
1652 */
1653 private Chomping scanBlockScalarIndicators(Mark startMark) {
1654 // See the specification for details.
1655 Boolean chomping = null;
1656 int increment = -1;
1657 char ch = reader.peek();
1658 if (ch == '-' || ch == '+') {
1659 if (ch == '+') {
1660 chomping = Boolean.TRUE;
1661 } else {
1662 chomping = Boolean.FALSE;
1663 }
1664 reader.forward();
1665 ch = reader.peek();
1666 if (Character.isDigit(ch)) {
1667 increment = Integer.parseInt(String.valueOf(ch));
1668 if (increment == 0) {
1669 throw new ScannerException("while scanning a block scalar", startMark,
1670 "expected indentation indicator in the range 1-9, but found 0",
1671 reader.getMark());
1672 }
1673 reader.forward();
1674 }
1675 } else if (Character.isDigit(ch)) {
1676 increment = Integer.parseInt(String.valueOf(ch));
1677 if (increment == 0) {
1678 throw new ScannerException("while scanning a block scalar", startMark,
1679 "expected indentation indicator in the range 1-9, but found 0",
1680 reader.getMark());
1681 }
1682 reader.forward();
1683 ch = reader.peek();
1684 if (ch == '-' || ch == '+') {
1685 if (ch == '+') {
1686 chomping = Boolean.TRUE;
1687 } else {
1688 chomping = Boolean.FALSE;
1689 }
1690 reader.forward();
1691 }
1692 }
1693 ch = reader.peek();
1694 if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1695 throw new ScannerException("while scanning a block scalar", startMark,
1696 "expected chomping or indentation indicators, but found " + ch,
1697 reader.getMark());
1698 }
1699 return new Chomping(chomping, increment);
1700 }
1701
1702 /**
1703 * Scan to the end of the line after a block scalar has been scanned; the
1704 * only things that are permitted at this time are comments and spaces.
1705 */
1706 private String scanBlockScalarIgnoredLine(Mark startMark) {
1707 // See the specification for details.
1708 int ff = 0;
1709 // Forward past any number of trailing spaces
1710 while (reader.peek(ff) == ' ') {
1711 ff++;
1712 }
1713 if (ff > 0) {
1714 reader.forward(ff);
1715 }
1716 // If a comment occurs, scan to just before the end of line.
1717 if (reader.peek() == '#') {
1718 ff = 0;
1719 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1720 ff++;
1721 }
1722 if (ff > 0) {
1723 reader.forward(ff);
1724 }
1725 }
1726 // If the next character is not a null or line break, an error has
1727 // occurred.
1728 char ch = reader.peek();
1729 String lineBreak = scanLineBreak();
1730 if (lineBreak.length() == 0 && ch != '\0') {
1731 throw new ScannerException("while scanning a block scalar", startMark,
1732 "expected a comment or a line break, but found " + ch, reader.getMark());
1733 }
1734 return lineBreak;
1735 }
1736
1737 /**
1738 * Scans for the indentation of a block scalar implicitly. This mechanism is
1739 * used only if the block did not explicitly state an indentation to be
1740 * used.
1741 *
1742 * @see http://www.yaml.org/spec/1.1/#id927035
1743 */
1744 private Object[] scanBlockScalarIndentation() {
1745 // See the specification for details.
1746 StringBuilder chunks = new StringBuilder();
1747 int maxIndent = 0;
1748 Mark endMark = reader.getMark();
1749 // Look ahead some number of lines until the first non-blank character
1750 // occurs; the determined indentation will be the maximum number of
1751 // leading spaces on any of these lines.
1752 while (Constant.LINEBR.has(reader.peek(), " \r")) {
1753 if (reader.peek() != ' ') {
1754 // If the character isn't a space, it must be some kind of
1755 // line-break; scan the line break and track it.
1756 chunks.append(scanLineBreak());
1757 endMark = reader.getMark();
1758 } else {
1759 // If the character is a space, move forward to the next
1760 // character; if we surpass our previous maximum for indent
1761 // level, update that too.
1762 reader.forward();
1763 if (this.reader.getColumn() > maxIndent) {
1764 maxIndent = reader.getColumn();
1765 }
1766 }
1767 }
1768 // Pass several results back together.
1769 return new Object[] { chunks.toString(), maxIndent, endMark };
1770 }
1771
1772 private Object[] scanBlockScalarBreaks(int indent) {
1773 // See the specification for details.
1774 StringBuilder chunks = new StringBuilder();
1775 Mark endMark = reader.getMark();
1776 int ff = 0;
1777 int col = this.reader.getColumn();
1778 // Scan for up to the expected indentation-level of spaces, then move
1779 // forward past that amount.
1780 while (col < indent && reader.peek(ff) == ' ') {
1781 ff++;
1782 col++;
1783 }
1784 if (ff > 0) {
1785 reader.forward(ff);
1786 }
1787 // Consume one or more line breaks followed by any amount of spaces,
1788 // until we find something that isn't a line-break.
1789 String lineBreak = null;
1790 while ((lineBreak = scanLineBreak()).length() != 0) {
1791 chunks.append(lineBreak);
1792 endMark = reader.getMark();
1793 // Scan past up to (indent) spaces on the next line, then forward
1794 // past them.
1795 ff = 0;
1796 col = this.reader.getColumn();
1797 while (col < indent && reader.peek(ff) == ' ') {
1798 ff++;
1799 col++;
1800 }
1801 if (ff > 0) {
1802 reader.forward(ff);
1803 }
1804 }
1805 // Return both the assembled intervening string and the end-mark.
1806 return new Object[] { chunks.toString(), endMark };
1807 }
1808
1809 /**
1810 * Scan a flow-style scalar. Flow scalars are presented in one of two forms;
1811 * first, a flow scalar may be a double-quoted string; second, a flow scalar
1812 * may be a single-quoted string.
1813 *
1814 * @see http://www.yaml.org/spec/1.1/#flow style/syntax
1815 *
1816 * <pre>
1817 * See the specification for details.
1818 * Note that we loose indentation rules for quoted scalars. Quoted
1819 * scalars don't need to adhere indentation because " and ' clearly
1820 * mark the beginning and the end of them. Therefore we are less
1821 * restrictive then the specification requires. We only need to check
1822 * that document separators are not included in scalars.
1823 * </pre>
1824 */
1825 private Token scanFlowScalar(char style) {
1826 boolean _double;
1827 // The style will be either single- or double-quoted; we determine this
1828 // by the first character in the entry (supplied)
1829 if (style == '"') {
1830 _double = true;
1831 } else {
1832 _double = false;
1833 }
1834 StringBuilder chunks = new StringBuilder();
1835 Mark startMark = reader.getMark();
1836 char quote = reader.peek();
1837 reader.forward();
1838 chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1839 while (reader.peek() != quote) {
1840 chunks.append(scanFlowScalarSpaces(startMark));
1841 chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1842 }
1843 reader.forward();
1844 Mark endMark = reader.getMark();
1845 return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1846 }
1847
1848 /**
1849 * Scan some number of flow-scalar non-space characters.
1850 */
1851 private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) {
1852 // See the specification for details.
1853 StringBuilder chunks = new StringBuilder();
1854 while (true) {
1855 // Scan through any number of characters which are not: NUL, blank,
1856 // tabs, line breaks, single-quotes, double-quotes, or backslashes.
1857 int length = 0;
1858 while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) {
1859 length++;
1860 }
1861 if (length != 0) {
1862 chunks.append(reader.prefixForward(length));
1863 }
1864 // Depending on our quoting-type, the characters ', " and \ have
1865 // differing meanings.
1866 char ch = reader.peek();
1867 if (!doubleQuoted && ch == '\'' && reader.peek(1) == '\'') {
1868 chunks.append("'");
1869 reader.forward(2);
1870 } else if ((doubleQuoted && ch == '\'') || (!doubleQuoted && "\"\\".indexOf(ch) != -1)) {
1871 chunks.append(ch);
1872 reader.forward();
1873 } else if (doubleQuoted && ch == '\\') {
1874 reader.forward();
1875 ch = reader.peek();
1876 if (ESCAPE_REPLACEMENTS.containsKey(Character.valueOf(ch))) {
1877 // The character is one of the single-replacement
1878 // types; these are replaced with a literal character
1879 // from the mapping.
1880 chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf(ch)));
1881 reader.forward();
1882 } else if (ESCAPE_CODES.containsKey(Character.valueOf(ch))) {
1883 // The character is a multi-digit escape sequence, with
1884 // length defined by the value in the ESCAPE_CODES map.
1885 length = (ESCAPE_CODES.get(Character.valueOf(ch))).intValue();
1886 reader.forward();
1887 String hex = reader.prefix(length);
1888 if (NOT_HEXA.matcher(hex).find()) {
1889 throw new ScannerException("while scanning a double-quoted scalar",
1890 startMark, "expected escape sequence of " + length
1891 + " hexadecimal numbers, but found: " + hex,
1892 reader.getMark());
1893 }
1894 int decimal = Integer.parseInt(hex, 16);
1895 String unicode = new String(Character.toChars(decimal));
1896 chunks.append(unicode);
1897 reader.forward(length);
1898 } else if (scanLineBreak().length() != 0) {
1899 chunks.append(scanFlowScalarBreaks(startMark));
1900 } else {
1901 throw new ScannerException("while scanning a double-quoted scalar", startMark,
1902 "found unknown escape character " + ch + "(" + ((int) ch) + ")",
1903 reader.getMark());
1904 }
1905 } else {
1906 return chunks.toString();
1907 }
1908 }
1909 }
1910
1911 private String scanFlowScalarSpaces(Mark startMark) {
1912 // See the specification for details.
1913 StringBuilder chunks = new StringBuilder();
1914 int length = 0;
1915 // Scan through any number of whitespace (space, tab) characters,
1916 // consuming them.
1917 while (" \t".indexOf(reader.peek(length)) != -1) {
1918 length++;
1919 }
1920 String whitespaces = reader.prefixForward(length);
1921 char ch = reader.peek();
1922 if (ch == '\0') {
1923 // A flow scalar cannot end with an end-of-stream
1924 throw new ScannerException("while scanning a quoted scalar", startMark,
1925 "found unexpected end of stream", reader.getMark());
1926 }
1927 // If we encounter a line break, scan it into our assembled string...
1928 String lineBreak = scanLineBreak();
1929 if (lineBreak.length() != 0) {
1930 String breaks = scanFlowScalarBreaks(startMark);
1931 if (!"\n".equals(lineBreak)) {
1932 chunks.append(lineBreak);
1933 } else if (breaks.length() == 0) {
1934 chunks.append(" ");
1935 }
1936 chunks.append(breaks);
1937 } else {
1938 chunks.append(whitespaces);
1939 }
1940 return chunks.toString();
1941 }
1942
1943 private String scanFlowScalarBreaks(Mark startMark) {
1944 // See the specification for details.
1945 StringBuilder chunks = new StringBuilder();
1946 while (true) {
1947 // Instead of checking indentation, we check for document
1948 // separators.
1949 String prefix = reader.prefix(3);
1950 if (("---".equals(prefix) || "...".equals(prefix))
1951 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1952 throw new ScannerException("while scanning a quoted scalar", startMark,
1953 "found unexpected document separator", reader.getMark());
1954 }
1955 // Scan past any number of spaces and tabs, ignoring them
1956 while (" \t".indexOf(reader.peek()) != -1) {
1957 reader.forward();
1958 }
1959 // If we stopped at a line break, add that; otherwise, return the
1960 // assembled set of scalar breaks.
1961 String lineBreak = scanLineBreak();
1962 if (lineBreak.length() != 0) {
1963 chunks.append(lineBreak);
1964 } else {
1965 return chunks.toString();
1966 }
1967 }
1968 }
1969
1970 /**
1971 * Scan a plain scalar.
1972 *
1973 * <pre>
1974 * See the specification for details.
1975 * We add an additional restriction for the flow context:
1976 * plain scalars in the flow context cannot contain ',', ':' and '?'.
1977 * We also keep track of the `allow_simple_key` flag here.
1978 * Indentation rules are loosed for the flow context.
1979 * </pre>
1980 */
1981 private Token scanPlain() {
1982 StringBuilder chunks = new StringBuilder();
1983 Mark startMark = reader.getMark();
1984 Mark endMark = startMark;
1985 int indent = this.indent + 1;
1986 String spaces = "";
1987 while (true) {
1988 char ch;
1989 int length = 0;
1990 // A comment indicates the end of the scalar.
1991 if (reader.peek() == '#') {
1992 break;
1993 }
1994 while (true) {
1995 ch = reader.peek(length);
1996 if (Constant.NULL_BL_T_LINEBR.has(ch)
1997 || (this.flowLevel == 0 && ch == ':' && Constant.NULL_BL_T_LINEBR
1998 .has(reader.peek(length + 1)))
1999 || (this.flowLevel != 0 && ",:?[]{}".indexOf(ch) != -1)) {
2000 break;
2001 }
2002 length++;
2003 }
2004 // It's not clear what we should do with ':' in the flow context.
2005 if (this.flowLevel != 0 && ch == ':'
2006 && Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length + 1), ",[]{}")) {
2007 reader.forward(length);
2008 throw new ScannerException("while scanning a plain scalar", startMark,
2009 "found unexpected ':'", reader.getMark(),
2010 "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.");
2011 }
2012 if (length == 0) {
2013 break;
2014 }
2015 this.allowSimpleKey = false;
2016 chunks.append(spaces);
2017 chunks.append(reader.prefixForward(length));
2018 endMark = reader.getMark();
2019 spaces = scanPlainSpaces();
2020 // System.out.printf("spaces[%s]\n", spaces);
2021 if (spaces.length() == 0 || reader.peek() == '#'
2022 || (this.flowLevel == 0 && this.reader.getColumn() < indent)) {
2023 break;
2024 }
2025 }
2026 return new ScalarToken(chunks.toString(), startMark, endMark, true);
2027 }
2028
2029 /**
2030 * See the specification for details. SnakeYAML and libyaml allow tabs
2031 * inside plain scalar
2032 */
2033 private String scanPlainSpaces() {
2034 int length = 0;
2035 while (reader.peek(length) == ' ' || reader.peek(length) == '\t') {
2036 length++;
2037 }
2038 String whitespaces = reader.prefixForward(length);
2039 String lineBreak = scanLineBreak();
2040 if (lineBreak.length() != 0) {
2041 this.allowSimpleKey = true;
2042 String prefix = reader.prefix(3);
2043 if ("---".equals(prefix) || "...".equals(prefix)
2044 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2045 return "";
2046 }
2047 StringBuilder breaks = new StringBuilder();
2048 while (true) {
2049 if (reader.peek() == ' ') {
2050 reader.forward();
2051 } else {
2052 String lb = scanLineBreak();
2053 if (lb.length() != 0) {
2054 breaks.append(lb);
2055 prefix = reader.prefix(3);
2056 if ("---".equals(prefix) || "...".equals(prefix)
2057 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2058 return "";
2059 }
2060 } else {
2061 break;
2062 }
2063 }
2064 }
2065 if (!"\n".equals(lineBreak)) {
2066 return lineBreak + breaks;
2067 } else if (breaks.length() == 0) {
2068 return " ";
2069 }
2070 return breaks.toString();
2071 }
2072 return whitespaces;
2073 }
2074
2075 /**
2076 * <p>
2077 * Scan a Tag handle. A Tag handle takes one of three forms:
2078 *
2079 * <pre>
2080 * "!" (c-primary-tag-handle)
2081 * "!!" (ns-secondary-tag-handle)
2082 * "!(name)!" (c-named-tag-handle)
2083 * </pre>
2084 *
2085 * Where (name) must be formatted as an ns-word-char.
2086 * </p>
2087 *
2088 * @see http://www.yaml.org/spec/1.1/#c-tag-handle
2089 * @see http://www.yaml.org/spec/1.1/#ns-word-char
2090 *
2091 * <pre>
2092 * See the specification for details.
2093 * For some strange reasons, the specification does not allow '_' in
2094 * tag handles. I have allowed it anyway.
2095 * </pre>
2096 */
2097 private String scanTagHandle(String name, Mark startMark) {
2098 char ch = reader.peek();
2099 if (ch != '!') {
2100 throw new ScannerException("while scanning a " + name, startMark,
2101 "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2102 }
2103 // Look for the next '!' in the stream, stopping if we hit a
2104 // non-word-character. If the first character is a space, then the
2105 // tag-handle is a c-primary-tag-handle ('!').
2106 int length = 1;
2107 ch = reader.peek(length);
2108 if (ch != ' ') {
2109 // Scan through 0+ alphabetic characters.
2110 // FIXME According to the specification, these should be
2111 // ns-word-char only, which prohibits '_'. This might be a
2112 // candidate for a configuration option.
2113 while (Constant.ALPHA.has(ch)) {
2114 length++;
2115 ch = reader.peek(length);
2116 }
2117 // Found the next non-word-char. If this is not a space and not an
2118 // '!', then this is an error, as the tag-handle was specified as:
2119 // !(name) or similar; the trailing '!' is missing.
2120 if (ch != '!') {
2121 reader.forward(length);
2122 throw new ScannerException("while scanning a " + name, startMark,
2123 "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2124 }
2125 length++;
2126 }
2127 String value = reader.prefixForward(length);
2128 return value;
2129 }
2130
2131 /**
2132 * <p>
2133 * Scan a Tag URI. This scanning is valid for both local and global tag
2134 * directives, because both appear to be valid URIs as far as scanning is
2135 * concerned. The difference may be distinguished later, in parsing. This
2136 * method will scan for ns-uri-char*, which covers both cases.
2137 * </p>
2138 *
2139 * <p>
2140 * This method performs no verification that the scanned URI conforms to any
2141 * particular kind of URI specification.
2142 * </p>
2143 *
2144 * @see http://www.yaml.org/spec/1.1/#ns-uri-char
2145 */
2146 private String scanTagUri(String name, Mark startMark) {
2147 // See the specification for details.
2148 // Note: we do not check if URI is well-formed.
2149 StringBuilder chunks = new StringBuilder();
2150 // Scan through accepted URI characters, which includes the standard
2151 // URI characters, plus the start-escape character ('%'). When we get
2152 // to a start-escape, scan the escaped sequence, then return.
2153 int length = 0;
2154 char ch = reader.peek(length);
2155 while (Constant.URI_CHARS.has(ch)) {
2156 if (ch == '%') {
2157 chunks.append(reader.prefixForward(length));
2158 length = 0;
2159 chunks.append(scanUriEscapes(name, startMark));
2160 } else {
2161 length++;
2162 }
2163 ch = reader.peek(length);
2164 }
2165 // Consume the last "chunk", which would not otherwise be consumed by
2166 // the loop above.
2167 if (length != 0) {
2168 chunks.append(reader.prefixForward(length));
2169 length = 0;
2170 }
2171 if (chunks.length() == 0) {
2172 // If no URI was found, an error has occurred.
2173 throw new ScannerException("while scanning a " + name, startMark,
2174 "expected URI, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2175 }
2176 return chunks.toString();
2177 }
2178
2179 /**
2180 * <p>
2181 * Scan a sequence of %-escaped URI escape codes and convert them into a
2182 * String representing the unescaped values.
2183 * </p>
2184 *
2185 * FIXME This method fails for more than 256 bytes' worth of URI-encoded
2186 * characters in a row. Is this possible? Is this a use-case?
2187 *
2188 * @see http://www.ietf.org/rfc/rfc2396.txt, section 2.4, Escaped Encoding.
2189 */
2190 private String scanUriEscapes(String name, Mark startMark) {
2191 // First, look ahead to see how many URI-escaped characters we should
2192 // expect, so we can use the correct buffer size.
2193 int length = 1;
2194 while (reader.peek(length * 3) == '%') {
2195 length++;
2196 }
2197 // See the specification for details.
2198 // URIs containing 16 and 32 bit Unicode characters are
2199 // encoded in UTF-8, and then each octet is written as a
2200 // separate character.
2201 Mark beginningMark = reader.getMark();
2202 ByteBuffer buff = ByteBuffer.allocate(length);
2203 while (reader.peek() == '%') {
2204 reader.forward();
2205 try {
2206 byte code = (byte) Integer.parseInt(reader.prefix(2), 16);
2207 buff.put(code);
2208 } catch (NumberFormatException nfe) {
2209 throw new ScannerException("while scanning a " + name, startMark,
2210 "expected URI escape sequence of 2 hexadecimal numbers, but found "
2211 + reader.peek() + "(" + ((int) reader.peek()) + ") and "
2212 + reader.peek(1) + "(" + ((int) reader.peek(1)) + ")",
2213 reader.getMark());
2214 }
2215 reader.forward(2);
2216 }
2217 buff.flip();
2218 try {
2219 return UriEncoder.decode(buff);
2220 } catch (CharacterCodingException e) {
2221 throw new ScannerException("while scanning a " + name, startMark,
2222 "expected URI in UTF-8: " + e.getMessage(), beginningMark);
2223 }
2224 }
2225
2226 /**
2227 * Scan a line break, transforming:
2228 *
2229 * <pre>
2230 * '\r\n' : '\n'
2231 * '\r' : '\n'
2232 * '\n' : '\n'
2233 * '\x85' : '\n'
2234 * default : ''
2235 * </pre>
2236 */
2237 private String scanLineBreak() {
2238 // Transforms:
2239 // '\r\n' : '\n'
2240 // '\r' : '\n'
2241 // '\n' : '\n'
2242 // '\x85' : '\n'
2243 // default : ''
2244 char ch = reader.peek();
2245 if (ch == '\r' || ch == '\n' || ch == '\u0085') {
2246 if (ch == '\r' && '\n' == reader.peek(1)) {
2247 reader.forward(2);
2248 } else {
2249 reader.forward();
2250 }
2251 return "\n";
2252 } else if (ch == '\u2028' || ch == '\u2029') {
2253 reader.forward();
2254 return String.valueOf(ch);
2255 }
2256 return "";
2257 }
2258
2259 /**
2260 * Chomping the tail may have 3 values - yes, no, not defined.
2261 */
2262 private static class Chomping {
2263 private final Boolean value;
2264 private final int increment;
2265
2266 public Chomping(Boolean value, int increment) {
2267 this.value = value;
2268 this.increment = increment;
2269 }
2270
2271 public boolean chompTailIsNotFalse() {
2272 return value == null || value;
2273 }
2274
2275 public boolean chompTailIsTrue() {
2276 return value != null && value;
2277 }
2278
2279 public int getIncrement() {
2280 return increment;
2281 }
2282 }
2283 }