1 /**
2 * Copyright (c) 2008-2012, http://www.snakeyaml.org
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.yaml.snakeyaml.reader;
17
18 /**
19 version: 1.1 / 2007-01-25
20 - changed BOM recognition ordering (longer boms first)
21
22 Original pseudocode : Thomas Weidenfeller
23 Implementation tweaked: Aki Nieminen
24 Implementation changed: Andrey Somov
25 * UTF-32 removed because it is not supported by YAML
26 * no default encoding
27
28 http://www.unicode.org/unicode/faq/utf_bom.html
29 BOMs:
30 00 00 FE FF = UTF-32, big-endian
31 FF FE 00 00 = UTF-32, little-endian
32 EF BB BF = UTF-8,
33 FE FF = UTF-16, big-endian
34 FF FE = UTF-16, little-endian
35
36 Win2k Notepad:
37 Unicode format = UTF-16LE
38 ***/
39
40 import java.io.IOException;
41 import java.io.InputStream;
42 import java.io.InputStreamReader;
43 import java.io.PushbackInputStream;
44 import java.io.Reader;
45 import java.nio.charset.Charset;
46 import java.nio.charset.CharsetDecoder;
47 import java.nio.charset.CodingErrorAction;
48
/**
 * Generic unicode text reader which uses a leading byte order mark (BOM) to
 * identify the encoding of the underlying byte stream.
 * <p>
 * The UTF-8, UTF-16BE and UTF-16LE BOMs are recognised and skipped. If no BOM
 * is present the stream is decoded as UTF-8 and no bytes are skipped.
 * The encoding is detected lazily, on the first call to
 * {@link #read(char[], int, int)} (or an explicit {@link #init()}).
 */
public class UnicodeReader extends Reader {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final Charset UTF16BE = Charset.forName("UTF-16BE");
    private static final Charset UTF16LE = Charset.forName("UTF-16LE");

    private static final byte[] UTF8_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
    private static final byte[] UTF16BE_BOM = { (byte) 0xFE, (byte) 0xFF };
    private static final byte[] UTF16LE_BOM = { (byte) 0xFF, (byte) 0xFE };
    private static final byte[][] KNOWN_BOMS = { UTF8_BOM, UTF16BE_BOM, UTF16LE_BOM };

    /** Raw byte stream; allows the bytes read ahead for BOM detection to be pushed back. */
    PushbackInputStream internalIn;
    /** Decoding reader; stays {@code null} until {@link #init()} has detected the encoding. */
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest recognised BOM (UTF-8). */
    private static final int BOM_SIZE = 3;

    /**
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get the stream encoding, or {@code null} if the stream is uninitialized
     * (or already closed). Call {@link #init()} or
     * {@link #read(char[], int, int)} to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null}
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            // Not initialized yet: honour the documented contract instead of
            // failing with a NullPointerException.
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read ahead up to three bytes and check for a BOM. Only the BOM bytes are
     * skipped; any extra bytes read are pushed back onto the stream. Calling
     * this method again after a successful initialization is a no-op.
     *
     * @throws IOException
     *             if reading from the underlying stream fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] head = new byte[BOM_SIZE];
        int n = 0;
        // A single read() may legally return fewer bytes than requested, so
        // keep reading while the bytes seen so far could still turn out to be
        // a BOM. Stopping as soon as that is impossible avoids blocking on
        // streams that do not start with a BOM.
        while (isProperBomPrefix(head, n)) {
            int count = internalIn.read(head, n, head.length - n);
            if (count <= 0) {
                break; // end of stream
            }
            n += count;
        }

        Charset encoding;
        int bomLength;
        if (startsWith(head, n, UTF8_BOM)) {
            encoding = UTF8;
            bomLength = UTF8_BOM.length;
        } else if (startsWith(head, n, UTF16BE_BOM)) {
            encoding = UTF16BE;
            bomLength = UTF16BE_BOM.length;
        } else if (startsWith(head, n, UTF16LE_BOM)) {
            encoding = UTF16LE;
            bomLength = UTF16LE_BOM.length;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = UTF8;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(head, bomLength, unread);
        }

        // Use the detected encoding; report unmappable input instead of
        // silently replacing it.
        CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(
                CodingErrorAction.REPORT);
        internalIn2 = new InputStreamReader(internalIn, decoder);
    }

    /**
     * Close the reader and the underlying stream. If nothing has been read
     * yet, the underlying stream is closed directly, without reading from it.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Read decoded characters into a portion of an array, detecting the
     * encoding first if that has not happened yet.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at the end of the stream
     * @throws IOException
     *             if reading or decoding fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

    /** True if the first {@code n} bytes of {@code head} are a strict prefix of some known BOM. */
    private static boolean isProperBomPrefix(byte[] head, int n) {
        for (int i = 0; i < KNOWN_BOMS.length; i++) {
            byte[] bom = KNOWN_BOMS[i];
            if (n < bom.length && sameBytes(head, bom, n)) {
                return true;
            }
        }
        return false;
    }

    /** True if at least {@code bom.length} bytes were read and they equal {@code bom}. */
    private static boolean startsWith(byte[] head, int n, byte[] bom) {
        return n >= bom.length && sameBytes(head, bom, bom.length);
    }

    /** True if the first {@code count} bytes of both arrays are equal. */
    private static boolean sameBytes(byte[] left, byte[] right, int count) {
        for (int i = 0; i < count; i++) {
            if (left[i] != right[i]) {
                return false;
            }
        }
        return true;
    }
}