com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java Source code

Java tutorial

Introduction

Here is the source code for com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java

Source

package com.github.hronom.scrape.dat.website.controllers;

import com.gargoylesoftware.htmlunit.AjaxController;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.github.hronom.scrape.dat.website.views.ScrapeView;
import com.teamdev.jxbrowser.chromium.Browser;
import com.ui4j.api.browser.BrowserEngine;
import com.ui4j.api.browser.BrowserFactory;
import com.ui4j.api.browser.Page;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Controller for the "scrape" button of a {@link ScrapeView}.
 *
 * <p>When the button fires, the controller reads the website URL, the selector and the chosen
 * browser engine from the view, loads the page with that engine (HtmlUnit, Ui4j or JxBrowser),
 * runs the selector over the resulting markup with Jsoup and writes the inner HTML of every
 * matching element to the view's output.
 *
 * <p>Each scrape runs on a background thread created for that click. The three engine
 * instances are created once in the constructor and reused by every scrape.
 */
public class ScrapeButtonController {
    private static final Logger logger = LogManager.getLogger();

    /** Delay between two checks of {@link Browser#isLoading()} while JxBrowser loads a page. */
    private static final long LOAD_POLL_INTERVAL_MILLIS = 1000L;

    private final ScrapeView scrapeView;

    private final WebClient webClient;
    private final BrowserEngine browserEngine;
    private final Browser browser;

    /**
     * Creates the three browser engines and registers this controller's listener on the view.
     *
     * @param scrapeViewArg view that supplies the inputs and receives progress and output
     */
    public ScrapeButtonController(ScrapeView scrapeViewArg) {
        scrapeView = scrapeViewArg;

        // Create HtmlUnit WebClient.
        webClient = new WebClient(BrowserVersion.FIREFOX_38);
        webClient.getOptions().setCssEnabled(true);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setPopupBlockerEnabled(false);
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getOptions().setActiveXNative(true);
        webClient.getOptions().setAppletEnabled(true);
        webClient.getOptions().setUseInsecureSSL(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getCookieManager().setCookiesEnabled(true);
        // Answers "true" for every AJAX request; per the HtmlUnit AjaxController contract this
        // requests synchronous processing (confirm against the HtmlUnit version in use).
        webClient.setAjaxController(new AjaxController() {
            @Override
            public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
                return true;
            }
        });

        // Create Ui4j BrowserEngine.
        browserEngine = BrowserFactory.getWebKit();

        // Create JxBrowser. No BrowserView is attached, so the browser is never shown.
        System.setProperty("teamdev.license.info", "true");
        browser = new Browser();

        // Register the listener last, so that it can never fire while the engine fields above
        // are still unassigned.
        scrapeView.addScrapeButtonActionListener(createScrapeButtonActionListener());
    }

    /**
     * Builds the listener that starts a scrape on a background thread.
     *
     * <p>Every event gets its own single-thread executor which is shut down right after the
     * task has been submitted: the submitted task still runs to completion, and the worker
     * thread terminates afterwards instead of lingering idle.
     *
     * @return listener to attach to the scrape button
     */
    public ActionListener createScrapeButtonActionListener() {
        return new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent event) {
                ExecutorService executor = Executors.newSingleThreadExecutor();
                try {
                    executor.submit(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                scrapeWithSelectedEngine();
                            } catch (RuntimeException e) {
                                // The Future returned by submit() is discarded, so an exception
                                // leaving this method would vanish without a trace.
                                logger.error("Scraping failed", e);
                            }
                        }
                    });
                } finally {
                    executor.shutdown();
                }
            }
        };
    }

    /**
     * Scrapes the page with HtmlUnit. Failures are logged, not thrown; the view's controls are
     * re-enabled in every case.
     */
    public void processByHtmlUnit() {
        long beginTime = startProcessing();
        try {
            URL url = new URL(scrapeView.getWebsiteUrl());
            scrapeView.setProgressBarTaskText("requesting page");
            logger.info("Requesting page...");
            HtmlPage page = webClient.getPage(url);
            logger.info("Requesting of page completed.");

            scrapeView.setProgressBarTaskText("viewing page as XML");
            logger.info("View page as XML");
            showSelectedElements(page.asXml());
        } catch (Exception e) {
            // Pass the throwable separately so the stack trace ends up in the log.
            logger.error("Scraping with HtmlUnit failed", e);
        } finally {
            try {
                webClient.close();
            } finally {
                finishProcessing(beginTime);
            }
        }
    }

    /**
     * Scrapes the page with Ui4j. Unchecked exceptions from the engine propagate to the caller,
     * but the view's controls are re-enabled in every case.
     */
    public void processByUi4j() {
        long beginTime = startProcessing();
        try {
            scrapeView.setProgressBarTaskText("requesting page");
            logger.info("Requesting page...");
            Page page = browserEngine.navigate(scrapeView.getWebsiteUrl());
            logger.info("Requesting of page completed.");

            scrapeView.setProgressBarTaskText("viewing page as HTML");
            logger.info("View page as HTML");
            showSelectedElements(page.getDocument().getBody().getInnerHTML());
        } finally {
            try {
                browserEngine.clearCookies();
            } finally {
                finishProcessing(beginTime);
            }
        }
    }

    /**
     * Scrapes the page with JxBrowser. Unchecked exceptions from the engine propagate to the
     * caller, but the view's controls are re-enabled in every case. If the calling thread is
     * interrupted while the page loads, the scrape is abandoned and the interrupt flag is kept.
     */
    public void processByJxBrowser() {
        long beginTime = startProcessing();
        try {
            scrapeView.setProgressBarTaskText("requesting page");
            logger.info("Requesting page...");
            browser.loadURL(scrapeView.getWebsiteUrl());
            if (!awaitJxBrowserLoad()) {
                logger.warn("Interrupted while waiting for the page to load; scrape aborted.");
                return;
            }
            logger.info("Requesting of page completed.");

            scrapeView.setProgressBarTaskText("viewing page as HTML");
            logger.info("View page as HTML");
            showSelectedElements(browser.getHTML());
        } finally {
            try {
                browser.stop();
            } finally {
                finishProcessing(beginTime);
            }
        }
    }

    /** Reads the engine chosen in the view and runs the matching scrape method. */
    private void scrapeWithSelectedEngine() {
        String selectedBrowserEngine = scrapeView.getSelectedBrowserEngine();
        if (selectedBrowserEngine == null) {
            // A string switch throws NullPointerException on null; report it like any other
            // unknown engine instead.
            logger.error("Unknown browser engine: null");
            return;
        }
        switch (selectedBrowserEngine) {
        case "HtmlUnit":
            processByHtmlUnit();
            break;
        case "Ui4j":
            processByUi4j();
            break;
        case "JxBrowser":
            processByJxBrowser();
            break;
        default:
            logger.error("Unknown browser engine: " + selectedBrowserEngine);
            break;
        }
    }

    /**
     * Disables the view's inputs, clears the output and logs the input parameters.
     *
     * @return start time in milliseconds, to be handed to {@link #finishProcessing(long)}
     */
    private long startProcessing() {
        scrapeView.setWebsiteUrlTextFieldEnabled(false);
        scrapeView.setSelectorTextFieldEnabled(false);
        scrapeView.setScrapeButtonEnabled(false);
        scrapeView.setWorkInProgress(true);
        scrapeView.setOutput("");

        scrapeView.setProgressBarTaskText("initializing");
        logger.info("Start processing...");
        long beginTime = System.currentTimeMillis();

        String websiteUrl = scrapeView.getWebsiteUrl();
        String selector = scrapeView.getSelector();
        if (!websiteUrl.isEmpty() && !selector.isEmpty()) {
            logger.info("Input parameters: \"" + websiteUrl + "\", \"" + selector + "\"");
        }
        return beginTime;
    }

    /**
     * Logs the elapsed time and re-enables the view's inputs.
     *
     * @param beginTime value returned by {@link #startProcessing()}
     */
    private void finishProcessing(long beginTime) {
        long endTime = System.currentTimeMillis();
        logger.info("Process time: " + (endTime - beginTime) + " ms.");
        logger.info("Processing complete.");

        scrapeView.setWorkInProgress(false);
        scrapeView.setScrapeButtonEnabled(true);
        scrapeView.setSelectorTextFieldEnabled(true);
        scrapeView.setWebsiteUrlTextFieldEnabled(true);
    }

    /**
     * Unescapes the markup, applies the view's selector to it and writes the inner HTML of all
     * matches to the view, each followed by a blank line. Leaves the output untouched when the
     * markup or the selector is empty or when nothing matches.
     *
     * @param markup page markup as delivered by the engine; may be {@code null}
     */
    private void showSelectedElements(String markup) {
        // NOTE(review): unescaping before parsing turns escaped text such as "&lt;b&gt;" into
        // real markup for Jsoup - confirm that this is intended.
        scrapeView.setProgressBarTaskText("unescaping HTML");
        logger.info("Unescape html");
        String html = StringEscapeUtils.unescapeHtml4(markup);

        logger.info("Get selector");
        String selector = scrapeView.getSelector();
        // unescapeHtml4(null) yields null, so guard against an engine that returned no markup.
        if (html == null || html.isEmpty() || selector.isEmpty()) {
            return;
        }

        scrapeView.setProgressBarTaskText("parsing HTML");
        logger.info("Parse HTML");
        Document doc = Jsoup.parse(html);

        scrapeView.setProgressBarTaskText("selecting elements in HTML");
        logger.info("select elements in HTML");
        Elements selectedElements = doc.select(selector);
        if (selectedElements.isEmpty()) {
            return;
        }

        scrapeView.setProgressBarTaskText("parsing selected elements");
        logger.info("Parse extracted elements");
        StringBuilder sb = new StringBuilder();
        for (Element element : selectedElements) {
            sb.append(element.html()).append("\n\n");
        }
        scrapeView.setOutput(sb.toString());
    }

    /**
     * Polls JxBrowser until it no longer reports that it is loading.
     *
     * @return {@code true} once loading has finished, {@code false} if the thread was
     *         interrupted while waiting (the interrupt flag is set again in that case)
     */
    private boolean awaitJxBrowserLoad() {
        while (browser.isLoading()) {
            try {
                Thread.sleep(LOAD_POLL_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return false;
            }
        }
        return true;
    }
}