Description
Tries to determine the character encoding of an XML file.
License
Apache License
Parameter
| Parameter | Description |
|---|---|
| file | the XML file whose encoding should be determined |
Exception
| Exception | Description |
|---|---|
| IOException | if the file cannot be opened or read |
Declaration
public static String getEncodingOfXml(File file) throws IOException
Method Source Code
//package com.java2s;
/**
* Copyright 2009 Welocalize, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * File-reading helpers plus a best-effort detector for the character encoding
 * of an XML file.
 */
public class Main {
    /** Encoding name returned for a UTF-8 byte order mark; also the fallback. */
    static public final String UTF8 = "UTF-8";
    /** Encoding name returned for a little-endian UTF-16 byte order mark. */
    static public final String UTF16LE = "UTF-16LE";
    /** Encoding name returned for a big-endian UTF-16 byte order mark. */
    static public final String UTF16BE = "UTF-16BE";

    /** Number of leading bytes inspected when looking for the XML declaration. */
    private static final int XML_HEAD_SIZE = 150;

    /** Marker that opens an XML declaration. */
    private static final String XML_DECL_START = "<?xml ";

    /**
     * Matches the encoding pseudo-attribute of an XML declaration. The XML
     * grammar allows whitespace around '=' and either quote character, so both
     * forms are accepted; group 1 holds a double-quoted name, group 2 a
     * single-quoted one. Empty names are not matched.
     */
    private static final Pattern ENCODING_PATTERN =
            Pattern.compile("encoding\\s*=\\s*(?:\"([^\"]+?)\"|'([^']+?)')");

    /**
     * Tries to find the encoding of an XML file.
     * <p>
     * The first bytes of the file are decoded with every charset available in
     * the JVM until one of them yields text containing {@code "<?xml "}. The
     * encoding named in that declaration is returned verbatim; if the
     * declaration names none, the byte order mark decides; if neither is
     * present, {@code "UTF-8"} is returned.
     *
     * @param file
     *            the XML file to inspect.
     * @return the declared or guessed encoding name, never {@code null}.
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    public static String getEncodingOfXml(File file) throws IOException {
        byte[] head = readFile(file, XML_HEAD_SIZE);
        for (Charset charset : Charset.availableCharsets().values()) {
            String text = new String(head, charset);
            int declStart = text.indexOf(XML_DECL_START);
            if (declStart < 0) {
                continue;
            }
            String declared = findDeclaredEncoding(text, declStart);
            if (declared != null) {
                return declared;
            }
            // No usable declaration: fall back to the byte order mark, which
            // sits in the bytes that were already read.
            String guessed = detectBom(head);
            return guessed != null ? guessed : UTF8;
        }
        return UTF8;
    }

    /**
     * Extracts the encoding name from the XML declaration starting at
     * {@code declStart}. Only the declaration itself (up to {@code "?>"}) is
     * searched so that an {@code encoding} attribute on a later element is not
     * mistaken for the declaration.
     */
    private static String findDeclaredEncoding(String text, int declStart) {
        int declEnd = text.indexOf("?>", declStart);
        String declaration = declEnd < 0
                ? text.substring(declStart)
                : text.substring(declStart, declEnd);
        Matcher matcher = ENCODING_PATTERN.matcher(declaration);
        if (!matcher.find()) {
            return null;
        }
        return matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
    }

    /**
     * Reads up to {@code size} bytes from the start of the given file.
     *
     * @return an array of exactly {@code size} bytes; positions past the end
     *         of a shorter file are left as zero.
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    public static byte[] readFile(File file, int size) throws IOException {
        return readFile(new FileInputStream(file), size);
    }

    /**
     * Reads bytes from given input stream with specified length, then closes
     * the stream.
     *
     * @return an array of exactly {@code size} bytes; positions past the end
     *         of a shorter stream are left as zero.
     * @throws IOException
     *             if reading fails.
     */
    public static byte[] readFile(InputStream in, int size) throws IOException {
        byte[] buffer = new byte[size];
        try {
            // A single read() may deliver fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < size) {
                int count = in.read(buffer, filled, size - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
        } finally {
            if (in != null) {
                in.close();
            }
        }
        return buffer;
    }

    /**
     * Reads the whole file and decodes it with the platform default charset.
     *
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    public static String readFile(File file) throws IOException {
        return new String(readAll(new FileInputStream(file)));
    }

    /**
     * Reads the whole file and decodes it with the given encoding.
     *
     * @throws IOException
     *             if the file cannot be opened or read, or the encoding is
     *             not supported.
     */
    public static String readFile(File file, String encoding) throws IOException {
        return readFile(new FileInputStream(file), encoding);
    }

    /**
     * Reads the given input stream to a string content, then closes the
     * stream.
     *
     * @throws IOException
     *             if reading fails or the encoding is not supported.
     */
    public static String readFile(InputStream in, String encoding) throws IOException {
        return new String(readAll(in), encoding);
    }

    /**
     * Drains the stream to its end and closes it. The stream is read in a
     * loop rather than sized from {@code available()}, which is only an
     * estimate and may be zero for a stream that still has data.
     */
    private static byte[] readAll(InputStream in) throws IOException {
        try {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                collected.write(chunk, 0, count);
            }
            return collected.toByteArray();
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Try to guess the file encoding.
     * <p>
     *
     * Only guesses encodings of "UTF-8", "UTF-16LE" or "UTF-16BE", based on
     * the byte order mark at the start of the file.
     *
     * @param file
     *            The file needed to guess the encoding.
     * @return The encoding, may be null.
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    public static String guessEncoding(File file) throws IOException {
        return detectBom(readFile(file, 3));
    }

    /**
     * Maps a leading byte order mark to an encoding name.
     *
     * @param b
     *            at least three leading bytes of the content.
     * @return the matching encoding name, or {@code null} when no known mark
     *         is present.
     */
    private static String detectBom(byte[] b) {
        if (b[0] == (byte) 0xef && b[1] == (byte) 0xbb && b[2] == (byte) 0xbf) {
            return UTF8;
        }
        if (b[0] == (byte) 0xff && b[1] == (byte) 0xfe) {
            return UTF16LE;
        }
        if (b[0] == (byte) 0xfe && b[1] == (byte) 0xff) {
            return UTF16BE;
        }
        return null;
    }
}
Related
- getEncoder()
- getEncoder(String encoding)
- getEncoding(byte[] htmlData)
- getEncoding(OutputStreamWriter inWriter)
- getEncoding(String text)
- getEncodingOption(List options)
- getEncodings()
- getEncodings()