de.ingrid.mdek.dwr.services.HttpService.java Source code

Introduction

Here is the source code for de.ingrid.mdek.dwr.services.HttpService.java
Source

/*
 * **************************************************-
 * Ingrid Portal MDEK Application
 * ==================================================
 * Copyright (C) 2014 - 2015 wemove digital solutions GmbH
 * ==================================================
 * Licensed under the EUPL, Version 1.1 or  as soon they will be
 * approved by the European Commission - subsequent versions of the
 * EUPL (the "Licence");
 * 
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 * http://ec.europa.eu/idabc/eupl5
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 * **************************************************#
 */
package de.ingrid.mdek.dwr.services;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * Service that fetches a remote HTML document and extracts plain-text
 * information from it (the page title, or the first words of its text).
 *
 * <p>NOTE(review): both methods fetch whatever URL the caller passes in.
 * If the URL can come from an untrusted client, consider restricting the
 * allowed schemes/hosts before fetching - confirm against the callers.</p>
 */
public class HttpService {

    private static final Logger log = Logger.getLogger(HttpService.class);

    /**
     * Matches a single word: one or more word characters followed by a word
     * boundary. The trailing lazy {@code \s*?} matches the empty string, so a
     * match always ends directly after the word itself.
     * Compiled once because the expression never changes.
     */
    private static final Pattern WORD_PATTERN = Pattern.compile("\\w+\\b\\s*?");

    /**
     * Fetches the document at {@code url} and returns the text of its
     * {@code <title>} element with XML entities unescaped.
     *
     * @param url location of the HTML document to parse
     * @return the unescaped title text, or an empty string if the document
     *         has no title text or could not be parsed
     */
    public String getHtmlTitle(String url) {
        try {
            Parser parser = new Parser(url);
            // Select text nodes whose parent is a tag named 'title'.
            NodeList titleTexts = parser.parse(new AndFilter(new NodeClassFilter(TextNode.class),
                    new HasParentFilter(new TagNameFilter("title"))));
            SimpleNodeIterator it = titleTexts.elements();
            // Only the first matching text node is of interest.
            if (it.hasMoreNodes()) {
                Node titleText = it.nextNode();
                return StringEscapeUtils.unescapeXml(titleText.getText());
            }
        } catch (ParserException e) {
            // Best effort: callers still get an empty title, but the failure is
            // no longer silently discarded.
            log.warn("Could not determine HTML title for url '" + url + "'", e);
        }

        return "";
    }

    /**
     * Fetches the document at {@code url}, converts it to plain text and
     * returns that text truncated after the first {@code maxWords} words.
     *
     * @param url      location of the HTML document to parse
     * @param maxWords maximum number of words to return; for values below 1
     *                 the text is returned untruncated
     * @return the extracted text cut off directly after the
     *         {@code maxWords}-th word, the complete text if it contains fewer
     *         words (or {@code maxWords < 1}), or an empty string if no text
     *         could be extracted
     */
    public String parseHtml(String url, int maxWords) {
        StringBean sb = new StringBean();
        sb.setLinks(false);
        sb.setReplaceNonBreakingSpaces(true);
        sb.setCollapse(true);
        sb.setURL(url);
        String str = sb.getStrings(); // fetch url contents to str

        if (str == null) {
            // Defensive: without this guard a null result would raise a
            // NullPointerException in Pattern.matcher below.
            log.warn("No text could be extracted from url '" + url + "'");
            return "";
        }

        if (maxWords < 1) {
            // Nothing to truncate to; return the complete text.
            return str;
        }

        // A single anchored regexp of the form '(word){maxWords}' turned out to
        // be far too slow for larger counts, so the simple word pattern is
        // applied maxWords times instead.
        Matcher matcher = WORD_PATTERN.matcher(str);
        for (int found = 0; found < maxWords; ++found) {
            if (!matcher.find()) {
                // str contains fewer than maxWords words. Just return it.
                return str;
            }
        }

        // str contains at least maxWords words. Cut it at the end of the last match.
        return str.substring(0, matcher.end());
    }
}