it.drwolf.ridire.cleaners.ReadabilityCleaner.java Source code

Java tutorial

Introduction

Here is the source code for it.drwolf.ridire.cleaners.ReadabilityCleaner.java

Source

/*******************************************************************************
 * Copyright 2013 Università degli Studi di Firenze
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.drwolf.ridire.cleaners;

import it.drwolf.ridire.cleaners.utils.StringWithEncoding;
import it.drwolf.ridire.entity.CrawledResource;
import it.drwolf.ridire.entity.Parameter;

import java.io.File;
import java.io.IOException;

import javax.persistence.EntityManager;

import org.apache.commons.io.FileUtils;
import org.jboss.seam.annotations.Name;
import org.w3c.css.sac.CSSException;
import org.w3c.css.sac.CSSParseException;
import org.w3c.css.sac.ErrorHandler;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.IncorrectnessListener;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ScriptResult;
import com.gargoylesoftware.htmlunit.ThreadedRefreshHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Seam component that extracts the main textual content of an HTML document
 * by loading it in a headless HtmlUnit browser and running a Readability
 * script against the resulting page.
 */
@Name("readabilityCleaner")
public class ReadabilityCleaner {

    /**
     * CSS error handler that discards every notification. It is installed on
     * the {@link WebClient} so that CSS problems in the loaded page are not
     * reported.
     */
    public class NoOpErrorHandler implements ErrorHandler {

        public void error(CSSParseException arg0) throws CSSException {
            // Intentionally empty: CSS errors are ignored.
        }

        public void fatalError(CSSParseException arg0) throws CSSException {
            // Intentionally empty: fatal CSS errors are ignored.
        }

        public void warning(CSSParseException arg0) throws CSSException {
            // Intentionally empty: CSS warnings are ignored.
        }

    }

    /**
     * Incorrectness listener that discards every notification. It is
     * installed on the {@link WebClient} to keep the log output small.
     */
    public class NoOpIncorrectnessListener implements IncorrectnessListener {

        public void notify(String arg0, Object arg1) {
            // Intentionally empty: incorrectness notifications are ignored.
        }

    }

    /** Placeholder inside the stored script that is replaced by the host URL. */
    private static final String HOST_PLACEHOLDER = "@@@HOST@@@";

    /** Id of the element from which the cleaned text is read. */
    private static final String READABILITY_CONTENT_ID = "readability-content";

    /**
     * Builds the {@link WebClient} used to load the page and run the script.
     * The setters are applied in the same order as before the extraction of
     * this helper.
     *
     * @return a newly created, configured client; the caller must close it
     */
    private WebClient createWebClient() {
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3);
        webClient.setCssEnabled(true);
        webClient.setJavaScriptEnabled(true);
        // See the FAQ: http://htmlunit.sourceforge.net/faq.html#AJAXDoesNotWork
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        // NOTE(review): these two waits run before any page has been loaded,
        // so they presumably return immediately; consider moving them after
        // the script execution - confirm against the HtmlUnit version in use.
        webClient.waitForBackgroundJavaScript(5000);
        webClient.waitForBackgroundJavaScriptStartingBefore(5000);
        // The following four setters are there to limit logging.
        webClient.setHTMLParserListener(null);
        webClient.setIncorrectnessListener(new NoOpIncorrectnessListener());
        webClient.setCssErrorHandler(new NoOpErrorHandler());
        webClient.setPrintContentOnFailingStatusCode(false);
        // Needed to get hold of the kind of HTTP error.
        webClient.setThrowExceptionOnFailingStatusCode(true);
        webClient.setThrowExceptionOnScriptError(false);
        webClient.setRefreshHandler(new ThreadedRefreshHandler());
        webClient.setTimeout(4000);
        return webClient;
    }

    /**
     * Extracts the cleaned text of a raw HTML document.
     * <p>
     * The raw content is written to a temporary file, which is then requested
     * through {@code <host>/tmp/resource.seam?filename=<temp file name>}. The
     * script stored in the {@code READABILITY_SOURCE} parameter (with its
     * {@code @@@HOST@@@} placeholder replaced by the trimmed
     * {@code READABILITY_HOSTAPP} value) is executed on the loaded page, and
     * the text of the element with id {@code readability-content} is returned.
     * <p>
     * Any failure while writing the file, loading the page or running the
     * script is printed to standard error and results in an empty string.
     * The temporary file is removed and the web client is closed in every
     * case.
     *
     * @param crawledResource
     *            the resource being cleaned; currently not used by this method
     * @param rawContentAndEncoding
     *            holder of the raw HTML; only its string content is read here.
     *            NOTE(review): the file is written with the platform default
     *            charset - confirm that this matches what the serving
     *            resource expects.
     * @param entityManager
     *            used to look up the two {@link Parameter} entities; both are
     *            expected to exist (a missing one causes a
     *            {@link NullPointerException}, as before)
     * @return the trimmed text of the {@code readability-content} element, or
     *         an empty string if it is absent or an error occurred
     */
    public String getCleanedTextFromString(CrawledResource crawledResource,
            StringWithEncoding rawContentAndEncoding, EntityManager entityManager) {
        String bookmarkJS = entityManager.find(Parameter.class, Parameter.READABILITY_SOURCE.getKey()).getValue();
        String host = entityManager.find(Parameter.class, Parameter.READABILITY_HOSTAPP.getKey()).getValue().trim();
        // Literal replacement: the host must not be interpreted as a regex
        // replacement string ('$' and '\' would be special with replaceAll).
        bookmarkJS = bookmarkJS.replace(HOST_PLACEHOLDER, host);

        WebClient webClient = this.createWebClient();
        File tmpFile = null;
        String ret = "";
        try {
            tmpFile = File.createTempFile("readability", null);
            FileUtils.writeStringToFile(tmpFile, rawContentAndEncoding.getString());
            HtmlPage htmlPage = webClient.getPage(host + "/tmp/resource.seam?filename=" + tmpFile.getName());
            // The page has been fetched: the file is no longer needed.
            FileUtils.deleteQuietly(tmpFile);
            if (htmlPage != null) {
                htmlPage.executeJavaScript(bookmarkJS);
                HtmlElement element = htmlPage.getElementById(READABILITY_CONTENT_ID);
                if (element != null) {
                    ret = element.asText();
                }
            }
        } catch (Throwable t) {
            // Best effort: report the problem and fall through with whatever
            // text (possibly none) has been collected so far.
            t.printStackTrace();
        } finally {
            // Safety net for the failure paths; deleteQuietly tolerates null
            // and already-deleted files and never throws.
            FileUtils.deleteQuietly(tmpFile);
            webClient.closeAllWindows();
        }
        return ret.trim();
    }
}