Here you can find the source of getCharsetFromDocument(ByteBuffer bb)
Parameter | Description |
---|---|
bb | The document to search |
private static String getCharsetFromDocument(ByteBuffer bb)
//package com.java2s;
/*  _______ __ __ _______ __
 * | __|__| |.--.--.-----.----.|_ _|.----.-----.--.--.| |_
 * |__ | | || | | -__| _| | | | _| _ | | || _|
 * |_______|__|__| \___/|_____|__| |___| |__| |_____|_____||____|
 *
 * Copyright 2008 - Gustav Tiger, Henrik Steen and Gustav "Gussoh" Sohtell
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers for discovering the character encoding an HTML document declares
 * for itself through its {@code <meta>} tags.
 */
public class Main {
    /**
     * Charset used to turn the raw bytes into text before the real charset is
     * known. ISO-8859-1 maps every byte to exactly one char, so decoding can
     * never fail and ASCII markup such as meta tags stays readable.
     */
    private static final Charset FALLBACK_CHARSET = StandardCharsets.ISO_8859_1;

    /** Matches a meta tag; group 1 is the raw attribute text. */
    private static final Pattern META_TAG = Pattern.compile("(?i)<meta\\s([^>]*)>");

    /**
     * Matches one {@code name=value} attribute. Group 1 is the name; exactly
     * one of groups 2 (double-quoted), 3 (single-quoted) or 4 (unquoted)
     * holds the value. Separate alternatives per quote style guarantee that a
     * value is closed by the same quote character that opened it.
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s\"'<>/=]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Get charset from a document.
     *
     * This function searches the document for meta tags and parses them. It
     * returns the charset declared by the first meta tag that carries one,
     * either as {@code <meta charset="...">} or as a Content-Type http-equiv
     * whose content has a charset parameter. Meta tags without a usable
     * charset are skipped and the search continues.
     *
     * The whole buffer (index 0 up to its capacity) is examined regardless of
     * its current position and limit, and the buffer's position, limit and
     * mark are left untouched. Heap, direct and read-only buffers are all
     * supported.
     *
     * @param bb The document to search
     *
     * @return The charset, if found. Otherwise null (also for a null buffer).
     */
    private static String getCharsetFromDocument(ByteBuffer bb) {
        if (bb == null) {
            return null;
        }

        // Work on a duplicate so the caller's position/limit are not moved;
        // a bulk get works for direct and read-only buffers where array()
        // would throw, and respects the offset of sliced buffers.
        ByteBuffer view = bb.duplicate();
        view.clear();
        byte[] raw = new byte[view.remaining()];
        view.get(raw);
        String document = new String(raw, FALLBACK_CHARSET);

        Matcher metaMatcher = META_TAG.matcher(document);
        while (metaMatcher.find()) {
            String charset = getCharsetFromMetaAttributes(metaMatcher.group(1));
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Extracts the charset declared by the attributes of a single meta tag.
     *
     * @param attributes The text between {@code <meta} and {@code >}
     * @return The declared charset. Null if this tag declares none.
     */
    private static String getCharsetFromMetaAttributes(String attributes) {
        String httpEquiv = null;
        String content = null;
        String charset = null;

        Matcher attributeMatcher = ATTRIBUTE.matcher(attributes);
        while (attributeMatcher.find()) {
            String name = attributeMatcher.group(1);
            String value = attributeMatcher.group(2);
            if (value == null) {
                value = attributeMatcher.group(3);
            }
            if (value == null) {
                value = attributeMatcher.group(4);
            }

            if (name.equalsIgnoreCase("http-equiv")) {
                httpEquiv = value;
            } else if (name.equalsIgnoreCase("content")) {
                content = value;
            } else if (name.equalsIgnoreCase("charset")) {
                charset = value;
            }
        }

        // HTML5 short form: <meta charset="...">
        if (charset != null) {
            String trimmed = charset.trim();
            if (!trimmed.isEmpty()) {
                return trimmed;
            }
        }

        // Classic form: <meta http-equiv="Content-Type" content="...; charset=...">
        if (content != null && "Content-Type".equalsIgnoreCase(httpEquiv)) {
            return getCharset(content);
        }
        return null;
    }

    /**
     * Extracts a charset from a Content-Type.
     *
     * The first {@code ;}-separated segment is the media type and is never
     * treated as a parameter. The charset value may be unquoted or wrapped in
     * single or double quotes; empty values are ignored.
     *
     * @param contentType The Content-Type to extract from
     * @return The extracted charset. Null if no charset was found (or if
     *         contentType is null).
     */
    private static String getCharset(String contentType) {
        if (contentType == null) {
            return null;
        }

        String[] parameters = contentType.split(";");
        for (int i = 1; i < parameters.length; i++) {
            String parameter = parameters[i];
            int split = parameter.indexOf('=');
            if (split == -1) {
                continue;
            }

            String key = parameter.substring(0, split).trim();
            if (!key.equalsIgnoreCase("charset")) {
                continue;
            }

            String value = unquote(parameter.substring(split + 1).trim());
            if (!value.isEmpty()) {
                return value;
            }
        }
        return null;
    }

    /**
     * Removes a leading quote character and, when present, the matching
     * trailing one. Values that do not start with a quote are returned as is.
     * An unterminated quoted value only loses its opening quote, so no
     * character of the charset name itself is ever dropped.
     *
     * @param value The trimmed parameter value, never null
     * @return The value without its surrounding quotes
     */
    private static String unquote(String value) {
        if (value.isEmpty()) {
            return value;
        }

        char quote = value.charAt(0);
        if (quote != '"' && quote != '\'') {
            return value;
        }

        int end = value.length();
        if (end >= 2 && value.charAt(end - 1) == quote) {
            end--;
        }
        return value.substring(1, end).trim();
    }
}