Java Charset Create getEncoding(String text)

Description

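Determines the character encoding declared in the text of an XML or HTML document, falling back to UTF-8 when no usable declaration is found.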

License

Open Source License

Declaration

private static Charset getEncoding(String text)

Method Source Code

//package com.java2s;
/*---------------------------------------------------------------
*  Copyright 2005 by the Radiological Society of North America
*
*  This source software is released under the terms of the
*  RSNA Public License (http://mirc.rsna.org/rsnapubliclicense)
*----------------------------------------------------------------*/

import java.nio.charset.Charset;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility for sniffing the character encoding that an XML or HTML document
 * declares about itself in its own text.
 */
public class Main {
    /** Charset returned whenever no usable encoding declaration is found. */
    public static Charset utf8 = Charset.forName("UTF-8");

    /**
     * Matches the {@code encoding} pseudo-attribute of an XML declaration.
     * The value may be enclosed in double or single quotes; the quoted value
     * (quotes included) is the last capturing group.
     */
    private static final Pattern XML_ENCODING = Pattern.compile(
            "^\\s*<\\?xml\\s+[^>]*\\s*encoding\\s*=\\s*(\"[^\"]*\"|'[^']*')",
            Pattern.DOTALL | Pattern.MULTILINE);

    /**
     * Matches the first {@code charset=...} found inside a {@code <meta>} tag
     * that follows an {@code <html>} tag, ignoring case. The value may be
     * double-quoted, single-quoted, or bare; a bare value ends at whitespace,
     * a quote, {@code ;}, {@code /} or {@code >} so that the characters closing
     * the surrounding attribute or tag are not mistaken for part of the name.
     * The value is the last capturing group.
     */
    private static final Pattern HTML_CHARSET = Pattern.compile(
            "^\\s*<html.*?<meta\\s+[^>]*?charset\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^\"'\\s/>;]*)",
            Pattern.DOTALL | Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    /**
     * Determines the charset declared by a document.
     *
     * <p>The text is first checked for an XML declaration carrying an
     * {@code encoding} attribute, then for an HTML {@code <meta>} charset
     * declaration. If neither is present, or the declared name is illegal or
     * unsupported on this JVM, {@link #utf8} is returned.
     *
     * @param text the document text; may be {@code null}
     * @return the declared charset, or {@link #utf8} when none can be determined
     */
    private static Charset getEncoding(String text) {
        if (text == null) {
            return utf8;
        }

        // See if this is an xml document with an encoding declaration.
        Matcher xmlMatcher = XML_ENCODING.matcher(text);
        if (xmlMatcher.find()) {
            return getEncoding(xmlMatcher);
        }

        // See if this is an html document with a charset declaration.
        Matcher htmlMatcher = HTML_CHARSET.matcher(text);
        if (htmlMatcher.find()) {
            return getEncoding(htmlMatcher);
        }

        // We don't recognize this document declaration; use UTF-8.
        // Maybe this should actually be ISO-8859-1 since that is the web
        // default encoding, but it is probably better to default to UTF-8
        // because that will be better for sites in the Far East, and the
        // pain for the Europeans will be minimal.
        return utf8;
    }

    /**
     * Converts the charset name captured by a successful match into a
     * {@link Charset}.
     *
     * <p>The name is taken from the matcher's last capturing group. One
     * leading and one trailing quote character (single or double) and any
     * surrounding whitespace are removed before the lookup.
     *
     * @param matcher a matcher on which {@code find()} has already succeeded
     * @return the named charset, or {@link #utf8} if the name is missing,
     *         illegal, or not supported
     */
    private static Charset getEncoding(Matcher matcher) {
        String name = matcher.group(matcher.groupCount());
        if (name == null) {
            return utf8;
        }
        name = name.trim();
        if (name.startsWith("\"") || name.startsWith("'")) {
            name = name.substring(1);
        }
        if (name.endsWith("\"") || name.endsWith("'")) {
            name = name.substring(0, name.length() - 1);
        }
        name = name.trim();
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ex) {
            // Covers both IllegalCharsetNameException and
            // UnsupportedCharsetException: fall back to the default.
            return utf8;
        }
    }
}

Java Charset Create getEncoding(String text)

Description

License

Declaration

Method Source Code

Related