Java tutorial
/*
 * Copyright (c) 2010-2012, Martijn Brinkers, Djigzo.
 *
 * This file is part of Djigzo email encryption.
 *
 * Djigzo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License
 * version 3, 19 November 2007 as published by the Free Software
 * Foundation.
 *
 * Djigzo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with Djigzo. If not, see <http://www.gnu.org/licenses/>
 *
 * Additional permission under GNU AGPL version 3 section 7
 *
 * If you modify this Program, or any covered work, by linking or
 * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar,
 * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar,
 * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar,
 * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar,
 * wsdl4j-1.6.1.jar (or modified versions of these libraries),
 * containing parts covered by the terms of Eclipse Public License,
 * tyrex license, freemarker license, dom4j license, mx4j license,
 * Spice Software License, Common Development and Distribution License
 * (CDDL), Common Public License (CPL) the licensors of this Program grant
 * you additional permission to convey the resulting work.
 */
package mitm.common.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;

import mitm.common.locale.CharacterEncoding;

import org.apache.commons.lang.ArrayUtils;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Reader that tries to detect the encoding (for example UTF-8) used by the input stream. The first
 * detection step is to check whether the input starts with a byte order mark (BOM). If a BOM is found,
 * the encoding indicated by the BOM is used and the BOM itself is skipped. If a BOM is not found, the
 * encoding is guessed using a {@link CharsetDetector}. If nothing can be detected, US-ASCII is used.
 * <p>
 * Detection is done lazily on the first call to {@link #read(char[], int, int)}.
 *
 * @author Martijn Brinkers
 *
 */
public class AutoDetectUnicodeReader extends Reader
{
    /*
     * Maximal BOM size
     */
    private static final int BOM_SIZE = 4;

    /*
     * Max bytes used for auto detect
     */
    private static final int AUTO_DETECT_BYTES = SizeUtils.KB * 8;

    /*
     * The encoding to use. Null until detected, unless it was provided with the constructor.
     */
    private String encoding;

    /*
     * The confidence level of the detected encoding (0-100). Only set when the encoding was
     * guessed by the CharsetDetector; stays 0 otherwise.
     */
    private int confidence;

    /*
     * The input
     */
    private final InputStream input;

    /*
     * Wraps the input. Required to be able to push back the bytes that were read for BOM and
     * charset detection.
     */
    private final PushbackInputStream pushback;

    /*
     * Reader wrapping the pushback stream. Null until the encoding is known.
     */
    private Reader reader;

    /**
     * Creates an instance of AutoDetectUnicodeReader that will detect the encoding using the BOM or,
     * when there is no BOM, based on the used characters using a CharsetDetector.
     *
     * @param input the stream to read from (must not be null)
     */
    public AutoDetectUnicodeReader(InputStream input) {
        this(input, null);
    }

    /**
     * Convenience constructor. If encoding is non-null, the encoding will not be detected but the given
     * encoding will be used. This constructor is added for convenience to make it possible to use this
     * reader when the encoding is known.
     *
     * @param input the stream to read from (must not be null)
     * @param encoding the encoding to use, or null to auto detect the encoding
     */
    public AutoDetectUnicodeReader(InputStream input, String encoding)
    {
        Check.notNull(input, "input");

        this.input = input;
        this.encoding = encoding;

        /*
         * The pushback buffer must be large enough to hold all bytes read for charset detection
         */
        this.pushback = new PushbackInputStream(input, AUTO_DETECT_BYTES);
    }

    /*
     * Returns true if at least prefix.length bytes are available in data and the first bytes of
     * data are equal to prefix.
     */
    private static boolean startsWith(byte[] data, int available, int... prefix)
    {
        if (available < prefix.length) {
            return false;
        }

        for (int i = 0; i < prefix.length; i++)
        {
            if (data[i] != (byte) prefix[i]) {
                return false;
            }
        }

        return true;
    }

    /*
     * Checks whether the input starts with a Unicode BOM. If so, encoding is set and the BOM is
     * consumed. All bytes read that are not part of a BOM are pushed back.
     *
     * Returns the number of bytes read, or -1 if the input was already at EOF.
     */
    private int detectBOM()
    throws IOException
    {
        byte[] bom = new byte[BOM_SIZE];

        /*
         * A single read is allowed to return fewer bytes than requested, so keep reading until
         * the buffer is full or EOF is reached. Nothing has been pushed back yet, so reading
         * from pushback is the same as reading from the input directly.
         */
        int n = 0;

        while (n < bom.length)
        {
            int read = pushback.read(bom, n, bom.length - n);

            if (read <= 0) {
                break;
            }

            n += read;
        }

        if (n == 0) {
            return -1;
        }

        /*
         * Only the first n bytes of bom are valid. The UTF-32 checks must come before the UTF-16
         * checks because the UTF-32LE BOM starts with the UTF-16LE BOM.
         */
        int bomLength;

        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        }
        else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        }
        else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        }
        else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        }
        else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        }
        else {
            /*
             * Unicode BOM mark not found, unread all bytes
             */
            bomLength = 0;
        }

        int unread = n - bomLength;

        if (unread > 0) {
            pushback.unread(bom, bomLength, unread);
        }

        return n;
    }

    /*
     * Guesses the encoding with a CharsetDetector using at most AUTO_DETECT_BYTES bytes of the
     * input. The bytes used for detection are pushed back. encoding and confidence are only set
     * if a charset supported by this JVM is found.
     */
    private void detectWithCharsetDetector()
    throws IOException
    {
        byte[] detectionBuffer = new byte[AUTO_DETECT_BYTES];

        int read = pushback.read(detectionBuffer);

        if (read == -1) {
            throw new IOException("EOF during detection.");
        }

        byte[] detectionBytes = ArrayUtils.subarray(detectionBuffer, 0, read);

        /*
         * need to push back the bytes read for detection
         */
        pushback.unread(detectionBytes);

        CharsetDetector detector = new CharsetDetector();

        detector.setText(detectionBytes);

        CharsetMatch[] matches = detector.detectAll();

        if (matches == null) {
            return;
        }

        /*
         * There might be multiple encodings. Find the first valid one.
         */
        for (CharsetMatch match : matches)
        {
            if (match == null) {
                continue;
            }

            if (Charset.isSupported(match.getName()))
            {
                encoding = match.getName();
                confidence = match.getConfidence();

                break;
            }
        }
    }

    /*
     * Determines the encoding (if not yet known) and creates the wrapped reader. Does nothing if
     * the reader was already created.
     */
    private void detect()
    throws IOException
    {
        if (reader != null) {
            return;
        }

        if (encoding == null)
        {
            int n = detectBOM();

            /*
             * if n == -1, an EOF was detected during BOM detection
             */
            if (encoding == null && n != -1)
            {
                /*
                 * No BOM found so auto detect charset
                 */
                detectWithCharsetDetector();
            }

            if (encoding == null)
            {
                /*
                 * fallback to ASCII if no encoding is found.
                 */
                encoding = CharacterEncoding.US_ASCII;
            }
        }

        reader = new InputStreamReader(pushback, encoding);
    }

    /**
     * Reads characters into a portion of an array. On the first call the encoding is detected
     * (unless an encoding was given with the constructor).
     *
     * @param cbuf destination buffer
     * @param off offset at which to start storing characters
     * @param len maximum number of characters to read
     * @return the number of characters read, or -1 if the end of the stream has been reached
     * @throws IOException if reading or encoding detection fails
     */
    @Override
    public int read(char[] cbuf, int off, int len)
    throws IOException
    {
        detect();

        return reader.read(cbuf, off, len);
    }

    /**
     * Closes the reader. If nothing has been read yet, the underlying input stream is closed directly.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close()
    throws IOException
    {
        if (reader != null) {
            reader.close();
        }
        else {
            input.close();
        }
    }

    /**
     * Returns the encoding in use. If no encoding was given with the constructor, this is null until
     * the first read because the encoding is detected on the first read.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Returns the confidence (as reported by the CharsetDetector) of the detected encoding. The value
     * is 0 if the encoding was not determined by the CharsetDetector.
     */
    public int getConfidence() {
        return confidence;
    }
}