com.screenslicer.core.scrape.ProcessPage.java Source code

Java tutorial

Introduction

Here is the source code for com.screenslicer.core.scrape.ProcessPage.java

Source

/* 
 * ScreenSlicer (TM) -- automatic, zero-config web scraping (TM)
 * Copyright (C) 2013-2014 Machine Publishers, LLC
 * ops@machinepublishers.com | screenslicer.com | machinepublishers.com
 * 717 Martin Luther King Dr W Ste I, Cincinnati, Ohio 45220
 *
 * You can redistribute this program and/or modify it under the terms of the
 * GNU Affero General Public License version 3 as published by the Free
 * Software Foundation. Additional permissions or commercial licensing may be
 * available--see LICENSE file or contact Machine Publishers, LLC for details.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License version 3
 * for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * version 3 along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * For general details about how to investigate and report license violations,
 * please see: https://www.gnu.org/licenses/gpl-violation.html
 * and email the author: ops@machinepublishers.com
 * Keep in mind that paying customers have more rights than the AGPL alone offers.
 */
package com.screenslicer.core.scrape;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
import org.openqa.selenium.remote.RemoteWebDriver;

import com.screenslicer.api.datatype.UrlTransform;
import com.screenslicer.common.CommonUtil;
import com.screenslicer.common.Log;
import com.screenslicer.core.scrape.Scrape.ActionFailed;
import com.screenslicer.core.scrape.type.Result;
import com.screenslicer.core.scrape.type.Results;
import com.screenslicer.core.scrape.type.Results.Leniency;
import com.screenslicer.core.util.Util;
import com.screenslicer.webapp.WebApp;

public class ProcessPage {
    private static final int NUM_EXTRACTIONS = 8;

    /**
     * Removes from the tree rooted at {@code body} every node for which
     * {@link Util#isHidden(Node)} returns true.
     *
     * Matching nodes are collected during the traversal and only detached once
     * the traversal has finished, so the tree is not modified while it is being
     * walked.
     *
     * @param body root of the tree to prune in place
     */
    private static void trim(Element body) {
        final List<Node> hiddenNodes = new ArrayList<Node>();
        NodeVisitor collector = new NodeVisitor() {
            @Override
            public void head(Node visited, int depth) {
                if (Util.isHidden(visited)) {
                    hiddenNodes.add(visited);
                }
            }

            @Override
            public void tail(Node visited, int depth) {
                // Nothing to do when leaving a node.
            }
        };
        body.traverse(collector);
        for (int i = 0; i < hiddenNodes.size(); i++) {
            hiddenNodes.get(i).remove();
        }
    }

    /**
     * Extracts results from an already-parsed element. Hidden nodes are trimmed
     * from {@code element} first, then extraction runs with an empty current URL,
     * trimming enabled and a fresh cache.
     *
     * @param element root element to process; it is modified in place by trimming
     * @param page page number, passed through to extraction
     * @param query search query, passed through to extraction
     * @return the extracted results, or {@code null} if any exception was thrown
     *         (the exception is logged)
     */
    public static List<Result> perform(Element element, int page, String query) {
        List<Result> extracted = null;
        try {
            trim(element);
            extracted = perform(element, page, query, "", true, new HashMap<String, Object>());
        } catch (Exception e) {
            Log.exception(e);
        }
        return extracted;
    }

    /**
     * Extracts results from the page currently loaded in {@code driver}.
     *
     * The page is opened as an element via {@link Util#openElement}, hidden nodes
     * are trimmed, and extraction is attempted with trimming enabled. If that
     * yields no results, extraction is retried with trimming disabled, reusing the
     * same cache.
     *
     * When {@code WebApp.DEBUG} is set, the trimmed HTML is also written to a file
     * in the working directory named after the current time in milliseconds. This
     * dump is best-effort: a failure to write it is logged and does not abort the
     * scrape.
     *
     * @param driver browser session whose current page is processed
     * @param page page number, passed through to extraction
     * @param query search query, passed through to extraction
     * @param whitelist passed through to {@code Util.openElement}
     * @param patterns passed through to {@code Util.openElement}
     * @param transforms passed through to {@code Util.openElement}
     * @return the extracted results (possibly null or empty if both attempts found nothing)
     * @throws ActionFailed wrapping any {@code Throwable} raised while processing
     */
    public static List<Result> perform(RemoteWebDriver driver, int page, String query, String[] whitelist,
            String[] patterns, UrlTransform[] transforms) throws ActionFailed {
        try {
            Element element = Util.openElement(driver, whitelist, patterns, transforms);
            trim(element);
            if (WebApp.DEBUG) {
                try {
                    // Explicit encoding so the dump does not depend on the platform default charset.
                    FileUtils.writeStringToFile(new File("./" + System.currentTimeMillis()), element.outerHtml(),
                            "UTF-8");
                } catch (IOException e) {
                    // The dump is only a debugging aid: report the failure but keep scraping.
                    Log.exception(e);
                }
            }
            Map<String, Object> cache = new HashMap<String, Object>();
            // Read the URL once; both attempts operate on the same page.
            String currentUrl = driver.getCurrentUrl();
            List<Result> results = perform(element, page, query, currentUrl, true, cache);
            if (results == null || results.isEmpty()) {
                results = perform(element, page, query, currentUrl, false, cache);
            }
            return results;
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Runs extraction with progressively different leniency settings and returns
     * the finalized results of the first attempt that produced any.
     *
     * Attempts are made in the order {@code Leniency.Title}, {@code Leniency.None},
     * {@code Leniency.Url}; later attempts only run when the earlier ones were
     * null or empty. If no attempt produced results, the (empty) Title attempt is
     * finalized and returned.
     *
     * @param body root element to extract from
     * @param page page number, passed through to extraction
     * @param query search query, passed through to extraction
     * @param currentUrl URL handed to {@code finalizeResults} for URL fixing
     * @param trim whether trimming is enabled for the attempts
     * @param cache shared cache reused across attempts
     * @return the finalized result list of the selected attempt
     */
    private static List<Result> perform(Element body, int page, String query, String currentUrl, boolean trim,
            Map<String, Object> cache) {
        Results titleResults = perform(body, page, query, Leniency.Title, trim, cache);
        if (titleResults != null && !titleResults.results().isEmpty()) {
            return finalizeResults(titleResults, currentUrl, body, page, query, Leniency.Title, trim, cache);
        }

        Results strictResults = perform(body, page, query, Leniency.None, trim, cache);
        if (strictResults != null && !strictResults.results().isEmpty()) {
            return finalizeResults(strictResults, currentUrl, body, page, query, Leniency.None, trim, cache);
        }

        Results urlResults = perform(body, page, query, Leniency.Url, trim, cache);
        if (urlResults != null && !urlResults.results().isEmpty()) {
            return finalizeResults(urlResults, currentUrl, body, page, query, Leniency.Url, trim, cache);
        }

        // Every attempt came back null or empty, so there is nothing left to
        // prefer; fall back to the Title attempt.
        // NOTE(review): finalizeResults dereferences its argument, so this relies on
        // the Title attempt being non-null -- confirm Results.resultsNull is a real instance.
        return finalizeResults(titleResults, currentUrl, body, page, query, Leniency.Title, trim, cache);
    }

    /**
     * Chooses between the given results and an un-trimmed re-run, then fixes the
     * URLs of the chosen list against {@code currentUrl}.
     *
     * The comparison only happens when {@code trim} is true and {@code results}
     * is non-empty. In that case extraction is repeated with trimming disabled,
     * and the un-trimmed results win only if their field score exceeds the
     * trimmed score scaled by 1.05 (rounded to the nearest integer).
     *
     * @param results results of the selected attempt
     * @param currentUrl URL passed to {@code Util.fixUrls}
     * @param body root element, used for the un-trimmed re-run
     * @param page page number, used for the un-trimmed re-run
     * @param query search query, used for the un-trimmed re-run
     * @param leniency leniency of the selected attempt, reused for the re-run
     * @param trim whether {@code results} were produced with trimming enabled
     * @param cache shared cache reused for the re-run
     * @return the URL-fixed result list that was selected
     */
    private static List<Result> finalizeResults(Results results, String currentUrl, Element body, int page,
            String query, Leniency leniency, boolean trim, Map<String, Object> cache) {
        if (WebApp.DEBUG) {
            System.out.println("Returning: (leniency) " + leniency.name());
        }
        boolean compareWithUntrimmed = trim && !results.results().isEmpty();
        if (compareWithUntrimmed) {
            Results alternative = perform(body, page, query, leniency, false, cache);
            int baselineScore = results.fieldScore(true, false);
            int alternativeScore = alternative.fieldScore(true, false);
            // The un-trimmed run has to beat the trimmed score by this margin to be chosen.
            int threshold = (int) Math.rint(baselineScore * 1.05d);
            if (alternativeScore > threshold) {
                if (WebApp.DEBUG) {
                    System.out.println("Un-trimmed selected.");
                }
                return Util.fixUrls(alternative.results(), currentUrl);
            }
        }
        if (WebApp.DEBUG) {
            System.out.println("Trimmed selected.");
        }
        return Util.fixUrls(results.results(), currentUrl);
    }

    /**
     * Builds a {@code Results} candidate for each extracted node and returns the
     * first candidate whose field score equals the maximum score.
     *
     * Extracted nodes are computed once per {@code cache} (stored under the key
     * "extractedNodes") by calling {@code Extract.perform} repeatedly, passing the
     * nodes found so far as the ignore list, until it returns nothing or at least
     * {@code NUM_EXTRACTIONS} nodes have been gathered. The {@code Extract.Cache}
     * is likewise kept in {@code cache} under "extractCache".
     *
     * @param body root element to extract from
     * @param page page number, passed through to extraction and result creation
     * @param query search query, passed through to result creation
     * @param leniency leniency used when creating results
     * @param trim trimming flag used when creating and scoring results
     * @param cache shared cache; read and updated by this method
     * @return the best-scoring candidate, else the first candidate if non-null,
     *         else {@code Results.resultsNull}
     */
    private static Results perform(Element body, int page, String query, Leniency leniency, boolean trim,
            Map<String, Object> cache) {
        if (WebApp.DEBUG) {
            System.out.println("-Perform-> " + "leniency=" + leniency.name() + "; trim=" + trim);
        }

        Extract.Cache extractCache;
        if (cache.containsKey("extractCache")) {
            extractCache = (Extract.Cache) cache.get("extractCache");
        } else {
            extractCache = new Extract.Cache();
        }
        cache.put("extractCache", extractCache);

        List<Node> candidates;
        if (cache.containsKey("extractedNodes")) {
            candidates = (List<Node>) cache.get("extractedNodes");
        } else {
            candidates = new ArrayList<Node>();
            cache.put("extractedNodes", candidates);
            List<Node> alreadyExtracted = new ArrayList<Node>();
            int extractedCount = 0;
            while (extractedCount < NUM_EXTRACTIONS) {
                List<Node> best = Extract.perform(body, page, alreadyExtracted, extractCache);
                if (best.isEmpty()) {
                    break;
                }
                for (Node extracted : best) {
                    extractedCount++;
                    candidates.add(extracted);
                    alreadyExtracted.add(extracted);
                }
            }
        }

        List<Results> attempts = new ArrayList<Results>();
        List<Integer> attemptScores = new ArrayList<Integer>();
        for (int pos = 0; pos < candidates.size(); pos++) {
            Results attempt = createResults(body, page, candidates.get(pos), pos, leniency, query, trim, cache);
            attempts.add(attempt);
            attemptScores.add(attempt.fieldScore(false, trim));
        }

        int topScore = CommonUtil.max(attemptScores);
        int index = 0;
        for (Results attempt : attempts) {
            if (attempt != null && attemptScores.get(index) == topScore) {
                if (WebApp.DEBUG) {
                    System.out.println("-->results" + (index + 1));
                }
                return attempt;
            }
            index++;
        }

        if (attempts.isEmpty() || attempts.get(0) == null) {
            return Results.resultsNull;
        }
        return attempts.get(0);
    }

    /**
     * Creates a {@code Results} instance for one extracted node.
     *
     * A nested map stored in {@code cache} under the key "createResults" is
     * created on first use and handed to the {@code Results} constructor.
     *
     * @param body root element the node was extracted from
     * @param page page number
     * @param nodeExtract extracted node to build results for
     * @param pos position of the node among the extracted nodes
     * @param leniency leniency passed to the {@code Results} constructor
     * @param query search query
     * @param trim trimming flag passed to the {@code Results} constructor
     * @param cache shared cache holding the nested "createResults" map
     * @return the new {@code Results}, or {@code Results.resultsNull} when
     *         {@code nodeExtract} is null or construction throws (the exception is logged)
     */
    private static Results createResults(Element body, int page, Node nodeExtract, int pos,
            Results.Leniency leniency, String query, boolean trim, Map<String, Object> cache) {
        if (nodeExtract == null) {
            return Results.resultsNull;
        }
        try {
            if (!cache.containsKey("createResults")) {
                cache.put("createResults", new HashMap<String, Object>());
            }
            Map<String, Object> resultsCache = (Map<String, Object>) cache.get("createResults");
            return new Results(body, page, nodeExtract, pos, leniency, query, trim, resultsCache);
        } catch (Exception e) {
            Log.exception(e);
            return Results.resultsNull;
        }
    }

    /**
     * Renders a result list as text, one line per result.
     *
     * Each line has the form
     * {@code <n> <> <date> <> <url> <> <title> <> <summary>} followed by a newline,
     * where {@code n} counts from 1.
     *
     * @param results results to render; may be null
     * @return the rendered lines, the empty string for an empty list, or
     *         {@code "FAIL"} when {@code results} is null
     */
    public static String infoString(List<Result> results) {
        if (results == null) {
            return "FAIL";
        }
        StringBuilder out = new StringBuilder();
        int lineNumber = 1;
        for (Result entry : results) {
            String line = lineNumber + " <> " + entry.date() + " <> " + entry.url() + " <> " + entry.title()
                    + " <> " + entry.summary() + "\n";
            out.append(line);
            lineNumber++;
        }
        return out.toString();
    }
}