Back to project page texthem.
The source code is released under:
GNU General Public License
If you think the Android project texthem listed on this page is inappropriate (for example, because it contains malicious code or tools, or violates copyright), please email info at java2s dot com. Thanks.
/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1/*w ww. j av a2s.co m*/ * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Mozilla Universal charset detector code. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 2001 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Shy Shalom <shooshX@gmail.com> * Kohei TAKETA <k-tak@void.in> (Java port) * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. 
* * ***** END LICENSE BLOCK ***** */ package org.mozilla.universalchardet; import org.mozilla.universalchardet.prober.CharsetProber; import org.mozilla.universalchardet.prober.MBCSGroupProber; import org.mozilla.universalchardet.prober.SBCSGroupProber; import org.mozilla.universalchardet.prober.EscCharsetProber; import org.mozilla.universalchardet.prober.Latin1Prober; import org.mozilla.universalchardet.Constants; public class UniversalDetector { //////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////// public static final float SHORTCUT_THRESHOLD = 0.95f; public static final float MINIMUM_THRESHOLD = 0.20f; //////////////////////////////////////////////////////////////// // inner types //////////////////////////////////////////////////////////////// public enum InputState { PURE_ASCII, ESC_ASCII, HIGHBYTE } //////////////////////////////////////////////////////////////// // fields //////////////////////////////////////////////////////////////// private InputState inputState; private boolean done; private boolean start; private boolean gotData; private byte lastChar; private String detectedCharset; private CharsetProber[] probers; private CharsetProber escCharsetProber; private CharsetListener listener; //////////////////////////////////////////////////////////////// // methods //////////////////////////////////////////////////////////////// /** * @param listener a listener object that is notified of * the detected encocoding. Can be null. */ public UniversalDetector(CharsetListener listener) { this.listener = listener; this.escCharsetProber = null; this.probers = new CharsetProber[3]; for (int i=0; i<this.probers.length; ++i) { this.probers[i] = null; } reset(); } public boolean isDone() { return this.done; } /** * @return The detected encoding is returned. If the detector couldn't * determine what encoding was used, null is returned. 
*/ public String getDetectedCharset() { return this.detectedCharset; } public void setListener(CharsetListener listener) { this.listener = listener; } public CharsetListener getListener() { return this.listener; } public void handleData(final byte[] buf, int offset, int length) { if (this.done) { return; } if (length > 0) { this.gotData = true; } if (this.start) { this.start = false; if (length > 3) { int b1 = buf[offset] & 0xFF; int b2 = buf[offset+1] & 0xFF; int b3 = buf[offset+2] & 0xFF; int b4 = buf[offset+3] & 0xFF; switch (b1) { case 0xEF: if (b2 == 0xBB && b3 == 0xBF) { this.detectedCharset = Constants.CHARSET_UTF_8; } break; case 0xFE: if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) { this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412; } else if (b2 == 0xFF) { this.detectedCharset = Constants.CHARSET_UTF_16BE; } break; case 0x00: if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) { this.detectedCharset = Constants.CHARSET_UTF_32BE; } else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) { this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143; } break; case 0xFF: if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) { this.detectedCharset = Constants.CHARSET_UTF_32LE; } else if (b2 == 0xFE) { this.detectedCharset = Constants.CHARSET_UTF_16LE; } break; } // swich end if (this.detectedCharset != null) { this.done = true; return; } } } // if (start) end int maxPos = offset + length; for (int i=offset; i<maxPos; ++i) { int c = buf[i] & 0xFF; if ((c & 0x80) != 0 && c != 0xA0) { if (this.inputState != InputState.HIGHBYTE) { this.inputState = InputState.HIGHBYTE; if (this.escCharsetProber != null) { this.escCharsetProber = null; } if (this.probers[0] == null) { this.probers[0] = new MBCSGroupProber(); } if (this.probers[1] == null) { this.probers[1] = new SBCSGroupProber(); } if (this.probers[2] == null) { this.probers[2] = new Latin1Prober(); } } } else { if (this.inputState == InputState.PURE_ASCII && (c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) { 
this.inputState = InputState.ESC_ASCII; } this.lastChar = buf[i]; } } // for end CharsetProber.ProbingState st; if (this.inputState == InputState.ESC_ASCII) { if (this.escCharsetProber == null) { this.escCharsetProber = new EscCharsetProber(); } st = this.escCharsetProber.handleData(buf, offset, length); if (st == CharsetProber.ProbingState.FOUND_IT) { this.done = true; this.detectedCharset = this.escCharsetProber.getCharSetName(); } } else if (this.inputState == InputState.HIGHBYTE) { for (int i=0; i<this.probers.length; ++i) { st = this.probers[i].handleData(buf, offset, length); if (st == CharsetProber.ProbingState.FOUND_IT) { this.done = true; this.detectedCharset = this.probers[i].getCharSetName(); return; } } } else { // pure ascii // do nothing } } public void dataEnd() { if (!this.gotData) { return; } if (this.detectedCharset != null) { this.done = true; if (this.listener != null) { this.listener.report(this.detectedCharset); } return; } if (this.inputState == InputState.HIGHBYTE) { float proberConfidence; float maxProberConfidence = 0.0f; int maxProber = 0; for (int i=0; i<this.probers.length; ++i) { proberConfidence = this.probers[i].getConfidence(); if (proberConfidence > maxProberConfidence) { maxProberConfidence = proberConfidence; maxProber = i; } } if (maxProberConfidence > MINIMUM_THRESHOLD) { this.detectedCharset = this.probers[maxProber].getCharSetName(); if (this.listener != null) { this.listener.report(this.detectedCharset); } } } else if (this.inputState == InputState.ESC_ASCII) { // do nothing } else { // do nothing } } public void reset() { this.done = false; this.start = true; this.detectedCharset = null; this.gotData = false; this.inputState = InputState.PURE_ASCII; this.lastChar = 0; if (this.escCharsetProber != null) { this.escCharsetProber.reset(); } for (int i=0; i<this.probers.length; ++i) { if (this.probers[i] != null) { this.probers[i].reset(); } } } //////////////////////////////////////////////////////////////// // testing 
//////////////////////////////////////////////////////////////// public static void main(String[] args) throws Exception { if (args.length != 1) { System.out.println("USAGE: java UniversalDetector filename"); return; } UniversalDetector detector = new UniversalDetector( new CharsetListener() { public void report(String name) { System.out.println("charset = " + name); } } ); byte[] buf = new byte[4096]; java.io.FileInputStream fis = new java.io.FileInputStream(args[0]); int nread; while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } detector.dataEnd(); } }