1 /**
2 * Copyright (c) 2008-2012, http://www.snakeyaml.org
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.yaml.snakeyaml.issues.issue148;
17
18 import java.util.Formatter;
19
20 import junit.framework.TestCase;
21
22 import org.yaml.snakeyaml.DumperOptions;
23 import org.yaml.snakeyaml.DumperOptions.ScalarStyle;
24 import org.yaml.snakeyaml.Yaml;
25 import org.yaml.snakeyaml.reader.ReaderException;
26
27 public class PrintableUnicodeTest extends TestCase {
28 public void testFFFD() {
29 Yaml yaml = createYaml();
30 String fffd = yaml.dump("\uFFFD");
31 assertEquals("\"\\ufffd\"\n", fffd);
32 }
33
34 public void testSerialization() {
35 // test serialization of all Unicode codepoints
36 Yaml yaml = createYaml();
37 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
38 String original = Character.toString((char) c);
39 String serialized = yaml.dump(original);
40
41 // "On output, a YAML processor must only produce these acceptable
42 // characters,
43 // and should also escape all non-printable Unicode characters."
44 for (int i = 0; i < serialized.length(); i++) {
45 int cp = (int) serialized.charAt(i);
46 if (!isAcceptable(cp))
47 fail(String.format(
48 "U+%04x: Serialization produced result with unacceptable U+%04x\n", c,
49 cp));
50 if (!isPrintable(cp))
51 fail(String.format(
52 "U+%04x: Serialization produced result with nonprintable U+%04x\n", c,
53 cp));
54 }
55 }
56 }
57
58 public void testDeserialization() {
59 // test deserialization of non-escaped codepoints
60 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
61 // ignore breaks, which have special meaning
62 if (c == 0x0A || c == 0x0D || c == 0x85 || c == 0x2028 || c == 0x2029)
63 continue;
64 if (!isAcceptable(c) || c == 0x27)
65 continue;
66 String expected = Character.toString((char) c);
67 String serialized = "'" + expected + "'";
68
69 String result;
70 try {
71 result = new Yaml().load(serialized).toString();
72 } catch (ReaderException e) {
73 fail(String
74 .format("U+%04x: Deserialization threw ReaderException for an acceptable character\n",
75 c));
76 continue;
77 }
78 if (!result.equals(expected))
79 fail(String.format("U+%04x: Deserialization incorrect: %s\n", c, hexdump(result)));
80 }
81 }
82
83 public void testDeserialization2() {
84 // test deserialization of escaped codepoints
85 // "Any such characters must be presented using escape sequences."
86 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
87 String expected = Character.toString((char) c);
88 String serialized = String.format("\"\\u%04x\"", c);
89
90 String result;
91 try {
92 result = new Yaml().load(serialized).toString();
93 } catch (ReaderException e) {
94 fail(String
95 .format("U+%04x: Deserialization threw ReaderException for an acceptable escaped character\n",
96 c));
97 continue;
98 }
99 if (!result.equals(expected))
100 fail(String.format("U+%04x: Deserialization of escaped character incorrect: %s\n",
101 c, hexdump(result)));
102 }
103 }
104
105 private Yaml createYaml() {
106 DumperOptions options = new DumperOptions();
107 options.setAllowUnicode(false);
108 options.setDefaultScalarStyle(ScalarStyle.DOUBLE_QUOTED);
109 return new Yaml(options);
110 }
111
112 /**
113 * Test whether a character is printable, according to the YAML spec.
114 * ('c-printable')
115 */
116 public static boolean isPrintable(int c) {
117 return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0x7E) // 8
118 // bit
119 || c == 0x85 || (c >= 0xA0 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD) // 16
120 // bit
121 || (c >= 0x10000 && c <= 0x10FFFF); // 32 bit
122 }
123
124 /**
125 * "On input, a YAML processor must accept all printable ASCII characters,
126 * the space, tab, line break, and all Unicode characters beyond #x9F. On
127 * output, a YAML processor must only produce these acceptable characters,
128 * and should also escape all non-printable Unicode characters. The allowed
129 * character range explicitly excludes the surrogate block #xD800-#xDFFF,
130 * DEL #x7F, the C0 control block #x0-#x1F (except for #x9, #xA, and #xD),
131 * the C1 control block #x80-#x9F, #xFFFE, and #xFFFF."
132 */
133 public static boolean isAcceptable(int c) {
134 return (c >= 0x20 && c <= 0x7e // accept all printable ASCII characters,
135 // the space,
136 || c == 0x09 // tab,
137 || c == 0x0A || c == 0x0D || c == 0x85 || c == 0x2028 || c == 0x2029 // line
138 // break,
139 || isUnicodeCharacter(c) && c >= 0x9F // and all Unicode characters
140 // beyond #x9F
141 ) && !( // The allowed character range explicitly excludes
142 c >= 0xD800 && c <= 0xDFFF // the surrogate block #xD800-#xDFFF
143 || c == 0x7f // DEL #x7F,
144 || c <= 0x1F && !(c == 0x09 || c == 0x0A || c == 0x0D) // the
145 // C0
146 // control
147 // block
148 // #x0-#x1F
149 // (except
150 // for
151 // #x9,
152 // #xA,
153 // and
154 // #xD),
155 || c >= 0x80 && c <= 0x9F // the C1 control block
156 // #x80-#x9F,
157 || c == 0xFFFE // #xFFFE,
158 || c == 0xFFFF // and #xFFFF.
159 );
160 }
161
162 /**
163 * Tests whether a codepoint is a designated Unicode noncharacter or not.
164 */
165 public static boolean isUnicodeCharacter(int c) {
166 int plane = c / 0x10000;
167 return !(c >= 0xFDD0 && c <= 0xFDEF) && (plane <= 16 && (c & 0xFFFE) != 0xFFFE);
168 }
169
170 public static String hexdump(String input) {
171 StringBuilder result = new StringBuilder();
172 Formatter formatter = new Formatter(result);
173 for (int i = 0; i < input.length(); i++)
174 formatter.format("%02x ", (int) input.charAt(i));
175 return result.toString();
176 }
177 }