org.apache.nutch.protocol.selenium.HttpWebClient.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.protocol.selenium.HttpWebClient.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.protocol.selenium;

import java.lang.invoke.MethodHandles;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import org.openqa.selenium.By;
import org.openqa.selenium.Capabilities;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;

import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

//import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
//import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.firefox.FirefoxOptions;

import org.openqa.selenium.io.TemporaryFilesystem;

import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;

//import org.openqa.selenium.safari.SafariDriver;

//import org.openqa.selenium.phantomjs.PhantomJSDriver;
//import org.openqa.selenium.phantomjs.PhantomJSDriverService;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.openqa.selenium.opera.OperaOptions;
import org.openqa.selenium.opera.OperaDriver;
//import com.opera.core.systems.OperaDriver;

/**
 * Static helpers that create a Selenium {@link WebDriver}, load a URL with it
 * and return the rendered page body, optionally persisting a screenshot.
 *
 * <p>
 * Configuration properties read by this class:
 * <ul>
 * <li><code>page.load.delay</code> - page load timeout in seconds (default 3)</li>
 * <li><code>selenium.driver</code> - <code>firefox</code>, <code>chrome</code> or
 * <code>remote</code> (default <code>firefox</code>)</li>
 * <li><code>selenium.enable.headless</code> - run the browser headless (default
 * false)</li>
 * <li><code>selenium.grid.binary</code> - path of the local driver binary</li>
 * <li><code>selenium.hub.protocol</code>, <code>selenium.hub.host</code>,
 * <code>selenium.hub.port</code>, <code>selenium.hub.path</code> - location of
 * the hub used by the <code>remote</code> driver</li>
 * <li><code>selenium.grid.driver</code> - <code>firefox</code>,
 * <code>chrome</code> or <code>random</code> for the <code>remote</code>
 * driver</li>
 * <li><code>take.screenshot</code>, <code>screenshot.location</code> - whether
 * and where screenshots are stored</li>
 * </ul>
 */
public class HttpWebClient {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * Creates the WebDriver selected by <code>selenium.driver</code> and loads
     * the given URL with it.
     *
     * <p>
     * If the page load times out, the driver is returned anyway so the caller
     * can capture whatever has been loaded so far. On any other failure the
     * driver (if one was created) is cleaned up and the failure is rethrown.
     *
     * @param url
     *          the URL to load
     * @param conf
     *          the {@link org.apache.hadoop.conf.Configuration}; must not be
     *          null
     * @return the driver with the page loaded; the caller is responsible for
     *         passing it to {@link #cleanUpDriver(WebDriver)}
     * @throws RuntimeException
     *           wrapping the original exception when the driver cannot be
     *           created or the page cannot be loaded
     */
    public static WebDriver getDriverForPage(String url, Configuration conf) {
        WebDriver driver = null;
        long pageLoadWait = conf.getLong("page.load.delay", 3);

        try {
            String driverType = conf.get("selenium.driver", "firefox");
            boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless", false);

            switch (driverType) {
            case "firefox":
                String geckoDriverPath = conf.get("selenium.grid.binary", "/root/geckodriver");
                driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode);
                break;
            case "chrome":
                String chromeDriverPath = conf.get("selenium.grid.binary", "/root/chromedriver");
                driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode);
                break;
            // NOTE: no "opera" case is wired in here, although
            // createOperaWebDriver(String, boolean) is available for direct use.
            case "remote":
                String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
                int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
                String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
                String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
                URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort,
                        seleniumHubPath);

                String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");

                switch (seleniumGridDriver) {
                case "firefox":
                    driver = createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                    break;
                case "chrome":
                    driver = createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                    break;
                case "random":
                    driver = createRandomRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                    break;
                default:
                    // Report the grid driver name that was not recognised, not
                    // the outer driver type (which is always "remote" here).
                    LOG.error(
                            "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().",
                            seleniumGridDriver);
                    driver = createDefaultRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                    break;
                }
                break;
            default:
                LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().",
                        driverType);
                FirefoxOptions options = new FirefoxOptions();
                driver = new FirefoxDriver(options);
                break;
            }
            LOG.debug("Selenium {} WebDriver selected.", driverType);

            driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
            driver.get(url);
        } catch (Exception e) {
            // A timeout is only recoverable when a driver actually exists;
            // otherwise there is nothing to capture and we must fail.
            if (e instanceof TimeoutException && driver != null) {
                LOG.error("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
                return driver;
            }
            LOG.error("Selenium WebDriver: failed to load {}", url, e);
            cleanUpDriver(driver);
            throw new RuntimeException(e);
        }

        return driver;
    }

    /**
     * Creates a local Firefox driver.
     *
     * @param firefoxDriverPath
     *          path of the geckodriver binary; stored in the
     *          <code>webdriver.gecko.driver</code> system property
     * @param enableHeadlessMode
     *          whether to start the browser with <code>--headless</code>
     * @return the new driver
     */
    public static WebDriver createFirefoxWebDriver(String firefoxDriverPath, boolean enableHeadlessMode) {
        System.setProperty("webdriver.gecko.driver", firefoxDriverPath);
        FirefoxOptions firefoxOptions = new FirefoxOptions();
        if (enableHeadlessMode) {
            firefoxOptions.addArguments("--headless");
        }
        return new FirefoxDriver(firefoxOptions);
    }

    /**
     * Creates a local Chrome driver started with <code>--no-sandbox</code> and
     * <code>--disable-extensions</code>.
     *
     * @param chromeDriverPath
     *          path of the chromedriver binary; stored in the
     *          <code>webdriver.chrome.driver</code> system property
     * @param enableHeadlessMode
     *          whether to start the browser with <code>--headless</code>; set
     *          <code>selenium.enable.headless</code> to true if no monitor is
     *          attached to the server
     * @return the new driver
     */
    public static WebDriver createChromeWebDriver(String chromeDriverPath, boolean enableHeadlessMode) {
        System.setProperty("webdriver.chrome.driver", chromeDriverPath);
        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.addArguments("--no-sandbox");
        chromeOptions.addArguments("--disable-extensions");
        if (enableHeadlessMode) {
            chromeOptions.addArguments("--headless");
        }
        return new ChromeDriver(chromeOptions);
    }

    /**
     * Creates a local Opera driver started with <code>--no-sandbox</code> and
     * <code>--disable-extensions</code>.
     *
     * @param operaDriverPath
     *          path of the operadriver binary; stored in the
     *          <code>webdriver.opera.driver</code> system property
     * @param enableHeadlessMode
     *          whether to start the browser with <code>--headless</code>; set
     *          <code>selenium.enable.headless</code> to true if no monitor is
     *          attached to the server
     * @return the new driver
     */
    public static WebDriver createOperaWebDriver(String operaDriverPath, boolean enableHeadlessMode) {
        System.setProperty("webdriver.opera.driver", operaDriverPath);
        OperaOptions operaOptions = new OperaOptions();
        operaOptions.addArguments("--no-sandbox");
        operaOptions.addArguments("--disable-extensions");
        if (enableHeadlessMode) {
            operaOptions.addArguments("--headless");
        }
        return new OperaDriver(operaOptions);
    }

    /**
     * Creates a Firefox session on a remote Selenium hub.
     *
     * @param seleniumHubUrl
     *          URL of the hub
     * @param enableHeadlessMode
     *          whether to request a headless browser
     * @return the new remote driver
     */
    public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) {
        FirefoxOptions firefoxOptions = new FirefoxOptions();
        if (enableHeadlessMode) {
            firefoxOptions.setHeadless(true);
        }
        return new RemoteWebDriver(seleniumHubUrl, firefoxOptions);
    }

    /**
     * Creates a Chrome session on a remote Selenium hub.
     *
     * @param seleniumHubUrl
     *          URL of the hub
     * @param enableHeadlessMode
     *          whether to request a headless browser
     * @return the new remote driver
     */
    public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) {
        ChromeOptions chromeOptions = new ChromeOptions();
        if (enableHeadlessMode) {
            chromeOptions.setHeadless(true);
        }
        return new RemoteWebDriver(seleniumHubUrl, chromeOptions);
    }

    /**
     * Creates a remote session for a browser picked at random. Only Firefox and
     * Chrome are candidates; the number of browser types is hard-coded and
     * would have to move to configuration to support more (e.g. Edge, Opera,
     * Safari).
     *
     * @param seleniumHubUrl
     *          URL of the hub
     * @param enableHeadlessMode
     *          whether to request a headless browser
     * @return the new remote driver
     */
    public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) {
        // Number of supported browser types; when adding a third one, raise
        // this and extend the selection below.
        int browserTypes = 2;
        int choice = new Random().nextInt(browserTypes);
        if (choice == 0) {
            return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
        }
        return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    }

    /**
     * Creates the fallback remote driver, which is Firefox.
     *
     * @param seleniumHubUrl
     *          URL of the hub
     * @param enableHeadlessMode
     *          whether to request a headless browser
     * @return the new remote driver
     */
    public static RemoteWebDriver createDefaultRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) {
        return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    }

    /**
     * Quits the driver and deletes Selenium's temporary files. Failures are
     * logged and never propagated; a null driver is ignored.
     *
     * @param driver
     *          the driver to shut down, may be null
     */
    public static void cleanUpDriver(WebDriver driver) {
        if (driver == null) {
            return;
        }
        try {
            driver.quit();
        } catch (Exception e) {
            LOG.error("Error quitting Selenium WebDriver: ", e);
        }
        // Done in its own try so a failing quit() does not leave temporary
        // files behind.
        try {
            TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
        } catch (Exception e) {
            LOG.error("Error deleting Selenium temporary files: ", e);
        }
    }

    /**
     * Function for obtaining the HTML BODY using the selected <a href=
     * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
     * webdriver</a>. There are a number of configuration properties within
     * <code>nutch-site.xml</code> which determine whether to take screenshots of
     * the rendered pages and persist them as .png's into HDFS.
     *
     * <p>
     * The driver is always cleaned up before this method returns or throws.
     *
     * @param url
     *          the URL to fetch and render
     * @param conf
     *          the {@link org.apache.hadoop.conf.Configuration}
     * @return the rendered inner HTML page
     * @throws RuntimeException
     *           wrapping the original exception when fetching, taking the
     *           screenshot or reading the body fails
     */
    public static String getHtmlPage(String url, Configuration conf) {
        WebDriver driver = getDriverForPage(url, conf);

        try {
            if (conf.getBoolean("take.screenshot", false)) {
                takeScreenshot(driver, conf);
            }

            return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
        } catch (Exception e) {
            LOG.error("getHtmlPage(url, conf): " + e.toString());
            throw new RuntimeException(e);
        } finally {
            // Also removes Selenium's temporary files.
            cleanUpDriver(driver);
        }
    }

    /**
     * Fetches and renders a URL using a freshly created Hadoop
     * {@link org.apache.hadoop.conf.Configuration}.
     *
     * <p>
     * NOTE(review): the configuration is a plain <code>new Configuration()</code>,
     * so Nutch-specific resources are presumably not loaded and the built-in
     * defaults of this class apply - confirm this is acceptable for callers.
     *
     * @param url
     *          the URL to fetch and render
     * @return the rendered inner HTML page
     */
    public static String getHtmlPage(String url) {
        // A null configuration would fail immediately in getDriverForPage.
        return getHtmlPage(url, new Configuration());
    }

    /**
     * Takes a screenshot of the page currently loaded in the driver and copies
     * it to <code>screenshot.location</code> on the configured
     * {@link FileSystem}. Nothing is stored when that property is absent or
     * when a file with the same name already exists at the destination.
     *
     * <p>
     * The driver is not cleaned up here; that is left to the caller.
     *
     * @param driver
     *          the driver holding the rendered page
     * @param conf
     *          the {@link org.apache.hadoop.conf.Configuration}
     * @throws RuntimeException
     *           wrapping the original exception when the screenshot cannot be
     *           taken or stored
     */
    private static void takeScreenshot(WebDriver driver, Configuration conf) {
        try {
            String url = driver.getCurrentUrl();
            File srcFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
            LOG.debug("In-memory screenshot taken of: {}", url);

            String screenshotLocation = conf.get("screenshot.location");
            if (screenshotLocation == null) {
                LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
                        + "'screenshot.location' is absent from nutch-site.xml.", url);
                return;
            }

            FileSystem fs = FileSystem.get(conf);
            Path screenshotPath = new Path(screenshotLocation + "/" + srcFile.getName());
            if (fs.exists(screenshotPath)) {
                // Never overwrite; there is also no output stream to copy to.
                LOG.debug("Screenshot already exists at {}... not overwriting.", screenshotPath);
                return;
            }

            LOG.debug("No existing screenshot already exists... creating new file at {} {}.",
                    screenshotPath, srcFile.getName());
            try (InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
                    OutputStream os = fs.create(screenshotPath)) {
                // Streams are closed by try-with-resources, so copyBytes must
                // not close them itself.
                IOUtils.copyBytes(is, os, conf, false);
            }
            LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
        } catch (Exception e) {
            LOG.error("Error taking screenshot: ", e);
            throw new RuntimeException(e);
        }
    }
}