com.gistlabs.mechanize.document.html.JsoupDataUtil.java Source code

Java tutorial

Introduction

Here is the source code for com.gistlabs.mechanize.document.html.JsoupDataUtil.java

Source

/**
 * Copyright (C) 2012-2014 Gist Labs, LLC. (http://gistlabs.com)
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package com.gistlabs.mechanize.document.html;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Static helpers for reading a charset out of a Content-Type header and for
 * locating jsoup elements by tag name.
 *
 * <p>This file is inspired by the org.jsoup.helper.DataUtil.getCharsetFromContentType(String) method.
 * @author John Heintz <john@gistlabs.com>
 */
public class JsoupDataUtil {
    // Captures the token following "charset=" (case-insensitive). An optional opening single or
    // double quote is skipped, and the token ends at whitespace, ',', ';' or a quote character.
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*[\"']?([^\\s,;\"']*)");
    static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset

    /**
     * Parse out a charset from a content type header.
     * @param header e.g. "text/html; charset=EUC-JP"; may be null
     * @return "EUC-JP", or null if the header is null, has a null or empty value, or carries no
     *         non-empty charset token. Charset is trimmed and uppercased.
     */
    public static String getCharsetFromContentType(Header header) {
        if (header == null)
            return null;

        String value = header.getValue();
        if (value == null || value.isEmpty())
            return null;

        Matcher m = charsetPattern.matcher(value);
        while (m.find()) {
            String charset = m.group(1).trim();
            // A bare "charset=" captures an empty token; treat that as "not found" instead of
            // handing callers an empty charset name.
            if (!charset.isEmpty()) {
                // Locale.ROOT keeps the result independent of the JVM default locale
                // (e.g. a Turkish locale would turn "iso-8859-1" into "İSO-8859-1").
                return charset.toUpperCase(Locale.ROOT);
            }
        }
        return null;
    }

    /**
     * Returns all elements matching any of the given tags (case-insensitive).
     * The search covers the given element itself and all of its descendants; matches are
     * returned in the order they are visited (an element before its children).
     * @param element root of the subtree to search
     * @param tags tag names to look for
     * @return the matching elements, empty if none matched
     */
    public static Elements findElementsByTag(Element element, String... tags) {
        List<Element> results = new ArrayList<Element>();

        Set<String> tagSet = new HashSet<String>();
        for (String tag : tags) {
            // Locale.ROOT: the default locale must not affect tag matching ("TITLE" -> "title").
            tagSet.add(tag.toLowerCase(Locale.ROOT));
        }
        filterElementsByTag(results, element, tagSet);
        return new Elements(results);
    }

    /**
     * Appends to results the given element (if its lower-cased tag name is in tagSet) and then,
     * recursively, every matching descendant.
     */
    private static void filterElementsByTag(List<Element> results, Element element, Set<String> tagSet) {
        if (tagSet.contains(element.tag().getName().toLowerCase(Locale.ROOT)))
            results.add(element);

        for (Element child : element.children())
            filterElementsByTag(results, child, tagSet);
    }

    /**
     * Returns the first element found with the given tag (or tag sequence separated by '/') or null.
     * @param element element the search starts from
     * @param tag a single tag name such as "title", or a sequence such as "html/head/title"
     * @return the first element reached by matching every tag of the sequence in turn, or null
     */
    public static Element findFirstByTag(Element element, String tag) {
        return findFirstByTag(element, tag.split("/"), 0);
    }

    /**
     * Resolves tags[index..] starting at current: tries each element that
     * current.getElementsByTag(tags[index]) yields, in order, and returns the first one from which
     * the remaining tags can also be resolved. Returns current once every tag has been consumed,
     * or null when no candidate leads to a full match.
     */
    private static Element findFirstByTag(Element current, String[] tags, int index) {
        if (index >= tags.length)
            return current; // whole sequence matched

        // NOTE(review): jsoup documents getElementsByTag as including the element itself when it
        // matches, so a step can resolve to 'current' rather than a strict descendant - confirm
        // this is what callers expect.
        Elements elements = current.getElementsByTag(tags[index]);
        for (Element element : elements) {
            Element result = findFirstByTag(element, tags, index + 1);
            if (result != null)
                return result;
        }
        return null;
    }

}