com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.browser;

import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import javax.imageio.ImageIO;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriver.Timeouts;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.interactions.Action;
import org.openqa.selenium.interactions.Actions;
import org.xml.sax.SAXException;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.script.commands.Selectors.Selector;
import com.jaeksoft.searchlib.util.IOUtils;

/**
 * Base class wrapping a Selenium {@link WebDriver} instance and exposing the
 * operations used by the crawler: navigation, javascript execution, element
 * location, screenshots, window/frame switching, cookie extraction and HTML
 * archiving.
 * 
 * The wrapped driver is created by {@link #initialize()}, which is invoked
 * from the constructor, and released by {@link #close()}.
 * 
 * @param <T>
 *            the concrete WebDriver type managed by the subclass
 */
public abstract class BrowserDriver<T extends WebDriver> implements Closeable {

    protected final BrowserDriverEnum type;

    /** The wrapped driver; set back to null once {@link #close()} has run. */
    protected T driver = null;

    /**
     * Stores the driver type and creates the wrapped driver.
     * 
     * Note that {@link #initialize()} is called from this constructor, so it
     * runs before any subclass constructor body or field initializer.
     * 
     * @param type
     *            the enum value describing this driver
     */
    protected BrowserDriver(BrowserDriverEnum type) {
        this.type = type;
        driver = initialize();
    }

    /**
     * Creates the concrete WebDriver instance.
     * 
     * @return the driver that will be wrapped by this object
     */
    protected abstract T initialize();

    /**
     * Quits the wrapped driver, if any. Calling this method again afterwards
     * is a no-op.
     */
    @Override
    public void close() throws IOException {
        if (driver == null) {
            return;
        }
        try {
            driver.quit();
        } finally {
            // Drop the reference even when quit() fails, so that a second
            // call does not try to quit the same driver again.
            driver = null;
        }
    }

    /**
     * Loads the given URL in the browser.
     * 
     * @param sUrl
     *            the URL to load
     */
    final public void get(String sUrl) {
        driver.get(sUrl);
    }

    /**
     * @return the driver type given at construction time
     */
    public BrowserDriverEnum getType() {
        return type;
    }

    /**
     * Executes a javascript fragment in the current page.
     * 
     * @param javascript
     *            the script to execute
     * @param faultTolerant
     *            when true, any failure is logged as a warning and null is
     *            returned instead of throwing
     * @param objects
     *            the arguments handed over to the script
     * @return the value returned by the script, or null when the script
     *         returns nothing or when a failure was tolerated
     * @throws IOException
     *             if the driver cannot execute javascript (not fault
     *             tolerant only)
     * @throws SearchLibException
     *             wrapping any other failure (not fault tolerant only)
     */
    public Object javascript(String javascript, boolean faultTolerant, Object... objects)
            throws IOException, SearchLibException {
        try {
            if (!(driver instanceof JavascriptExecutor)) {
                throw new IOException("The Web driver don't support javascript execution");
            }
            return ((JavascriptExecutor) driver).executeScript(javascript, objects);
        } catch (IOException e) {
            if (!faultTolerant) {
                throw e;
            }
            Logging.warn(e);
        } catch (Exception e) {
            if (!faultTolerant) {
                throw new SearchLibException(e);
            }
            Logging.warn(e);
        }
        return null;
    }

    /**
     * Runs <code>document.getElementsByTagName</code> in the page.
     * 
     * @param tag
     *            the tag name passed to the script
     * @param faultTolerant
     *            see {@link #javascript(String, boolean, Object...)}
     * @return the script result cast to a list, or null if a failure was
     *         tolerated
     */
    public List<?> getElementByTag(String tag, boolean faultTolerant) throws IOException, SearchLibException {
        return (List<?>) javascript("return document.getElementsByTagName(arguments[0])", faultTolerant, tag);
    }

    /**
     * Reads the innerHTML of the first body element through javascript.
     * 
     * @return the innerHTML value returned by the script
     */
    public String getJavascriptInnerHtml() throws IOException, SearchLibException {
        // The script is executed as a function body: without an explicit
        // "return" its value is discarded and null comes back.
        return (String) javascript("return document.getElementsByTagName('body')[0].innerHTML", false);
    }

    /** Cached content of get_xpath.js, loaded on first use. */
    private static String XPATH_SCRIPT = null;

    /**
     * Loads (once) the get_xpath.js script from the classpath. Every line is
     * trimmed and the lines are concatenated without any separator.
     * 
     * @return the script as a single line
     * @throws IOException
     *             if the resource cannot be read
     */
    private static synchronized String getXPath() throws IOException {
        if (XPATH_SCRIPT != null) {
            return XPATH_SCRIPT;
        }
        // The name starts with '/', which is the absolute form understood by
        // Class.getResource; a bare ClassLoader lookup expects no leading
        // slash, hence the Class based overload.
        URL url = Resources.getResource(BrowserDriver.class,
                "/com/jaeksoft/searchlib/crawler/web/browser/get_xpath.js");
        String content = Resources.toString(url, Charsets.UTF_8);
        StringBuilder sb = new StringBuilder();
        BufferedReader br = new BufferedReader(new StringReader(content));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line.trim());
            }
        } finally {
            br.close();
        }
        XPATH_SCRIPT = sb.toString();
        return XPATH_SCRIPT;
    }

    /**
     * Computes the XPath of an element by running the get_xpath.js script
     * with the element as argument.
     * 
     * @param webElement
     *            the element to describe
     * @param faultTolerant
     *            see {@link #javascript(String, boolean, Object...)}
     * @return the XPath, or null if the script returned nothing (a warning is
     *         logged in that case)
     */
    public String getXPath(WebElement webElement, boolean faultTolerant) throws IOException, SearchLibException {
        String xPath = (String) javascript(getXPath(), faultTolerant, webElement);
        if (xPath == null) {
            Logging.warn("XPATH extraction failed on " + webElement);
        }
        return xPath;
    }

    /**
     * Takes a screenshot of the browser and decodes it as an image.
     * 
     * @return the image produced by ImageIO from the screenshot bytes
     * @throws IOException
     *             if the driver does not support screenshots or the bytes
     *             cannot be read
     */
    final public BufferedImage getScreenshot() throws IOException {
        if (!(driver instanceof TakesScreenshot)) {
            throw new IOException("This browser driver does not support screenshot");
        }
        byte[] data = ((TakesScreenshot) driver).getScreenshotAs(OutputType.BYTES);
        return ImageIO.read(new ByteArrayInputStream(data));
    }

    /**
     * Builds an AWT rectangle from the location and size of an element.
     * 
     * @param element
     *            the element to measure, may be null
     * @return the bounding box, or null if the element is null
     */
    final public Rectangle getRectangle(WebElement element) {
        if (element == null) {
            return null;
        }
        // Query the element once per property instead of once per coordinate.
        org.openqa.selenium.Point location = element.getLocation();
        Dimension size = element.getSize();
        return new Rectangle(location.x, location.y, size.width, size.height);
    }

    /**
     * @return the page source reported by the driver for the current page
     */
    public String getSourceCode() {
        return driver.getPageSource();
    }

    /**
     * Loads the given URL, then returns the page source.
     * 
     * @param sUrl
     *            the URL to load
     * @return the page source reported by the driver
     */
    final public String getSourceCode(String sUrl) {
        get(sUrl);
        return driver.getPageSource();
    }

    /**
     * @return the title of the current page
     */
    final public String getTitle() {
        return driver.getTitle();
    }

    /**
     * Loads the given URL, then returns the page title.
     * 
     * @param sUrl
     *            the URL to load
     * @return the title of the loaded page
     */
    final public String getTitle(String sUrl) {
        get(sUrl);
        return driver.getTitle();
    }

    /**
     * Resizes the browser window.
     * 
     * @param width
     *            the new width
     * @param height
     *            the new height
     */
    final public void setSize(int width, int height) throws SearchLibException {
        driver.manage().window().setSize(new Dimension(width, height));
    }

    /**
     * Sets the page load and script timeouts, both expressed in seconds.
     * 
     * @param pageLoad
     *            the page load timeout; null leaves it unchanged
     * @param script
     *            the script timeout; null leaves it unchanged
     */
    final public void setTimeouts(Integer pageLoad, Integer script) {
        Timeouts timeOuts = driver.manage().timeouts();
        // Guard against null: unboxing it would raise a NullPointerException.
        if (pageLoad != null) {
            timeOuts.pageLoadTimeout(pageLoad, TimeUnit.SECONDS);
        }
        if (script != null) {
            timeOuts.setScriptTimeout(script, TimeUnit.SECONDS);
        }
    }

    /**
     * Finds the elements matching the locator in the current page.
     * 
     * @param by
     *            the locator
     * @return the list returned by the driver
     */
    final public List<WebElement> locateBy(By by) throws SearchLibException {
        return driver.findElements(by);
    }

    /**
     * Finds the elements matching the locator and appends them to the given
     * collection.
     * 
     * @param by
     *            the locator
     * @param elements
     *            the collection receiving the located elements
     * @param faultTolerant
     *            when true, a failure is logged and 0 is returned
     * @return the number of elements located
     * @throws SearchLibException
     *             if the location failed and faultTolerant is false
     */
    final public int locateBy(By by, Collection<WebElement> elements, boolean faultTolerant)
            throws SearchLibException {
        try {
            List<WebElement> located = driver.findElements(by);
            if (located == null) {
                return 0;
            }
            elements.addAll(located);
            return located.size();
        } catch (Exception e) {
            if (!faultTolerant) {
                // NOTE(review): the original cause is not attached to the
                // thrown exception.
                throw new SearchLibException("Web element location failed: " + by);
            }
            Logging.warn(e);
            return 0;
        }
    }

    /**
     * Finds the elements matching the locator below the given element.
     * 
     * @param originElement
     *            the element to search from, may be null
     * @param by
     *            the locator
     * @param faultTolerant
     *            when true, a failure is logged and null is returned
     * @return the located elements, or null if originElement is null or a
     *         failure was tolerated
     * @throws SearchLibException
     *             if the location failed and faultTolerant is false
     */
    public final List<WebElement> locateBy(WebElement originElement, By by, boolean faultTolerant)
            throws SearchLibException {
        if (originElement == null) {
            return null;
        }
        try {
            return originElement.findElements(by);
        } catch (Exception e) {
            if (!faultTolerant) {
                // NOTE(review): the original cause is not attached to the
                // thrown exception.
                throw new SearchLibException("Web element location failed: " + by);
            }
            Logging.warn(e);
            return null;
        }
    }

    /**
     * Archives the current page with an {@link HtmlArchiver}.
     * 
     * The elements matched by the selectors flagged with disableScript are
     * located (fault tolerant), converted to XPath expressions, and the
     * resulting set is handed over to the archiver.
     * 
     * @param httpDownloader
     *            passed to the archiver
     * @param parentDirectory
     *            passed to the archiver
     * @param selectors
     *            the selectors to inspect, may be null
     * @return the archiver, after its archive method has been called
     */
    final public HtmlArchiver saveArchive(HttpDownloader httpDownloader, File parentDirectory,
            Collection<Selector> selectors) throws ClientProtocolException, IllegalStateException, IOException,
            SearchLibException, URISyntaxException, SAXException, ParserConfigurationException, ClassCastException,
            ClassNotFoundException, InstantiationException, IllegalAccessException, XPatherException {

        URL currentURL = new URL(driver.getCurrentUrl());
        HtmlArchiver archiver = new HtmlArchiver(this, parentDirectory, httpDownloader, currentURL);

        // Collect the elements whose scripts must be disabled.
        Set<WebElement> disableScriptWebElements = new HashSet<WebElement>();
        if (selectors != null) {
            for (Selector selector : selectors) {
                if (selector.disableScript) {
                    locateBy(selector.getBy(), disableScriptWebElements, true);
                }
            }
        }

        // Translate them to XPath; elements without an XPath are skipped.
        Set<String> xPathDisableScriptSet = new HashSet<String>();
        for (WebElement webElement : disableScriptWebElements) {
            String xPath = getXPath(webElement, true);
            if (xPath != null) {
                xPathDisableScriptSet.add(xPath);
            }
        }

        archiver.archive(this, xPathDisableScriptSet);
        return archiver;
    }

    /**
     * @return the handle of the current window
     */
    final public String getWindow() {
        return driver.getWindowHandle();
    }

    /**
     * Switches the driver to the window identified by the given handle.
     * 
     * @param window
     *            the window handle
     */
    final public void switchToWindow(String window) {
        driver.switchTo().window(window);
    }

    /**
     * Switches the driver to the given frame element.
     * 
     * @param frameWebelement
     *            the frame element
     */
    final public void switchToFrame(WebElement frameWebelement) {
        driver.switchTo().frame(frameWebelement);
    }

    /**
     * Switches the driver back to the default content.
     */
    final public void switchToMain() {
        driver.switchTo().defaultContent();
    }

    /**
     * Writes the source of a frame into a file named source.html located in
     * the capture directory. The directory is created if it does not exist.
     * 
     * @param frameWebelement
     *            the frame whose source is captured
     * @param captureDirectory
     *            the directory receiving source.html
     * @throws IOException
     *             if the file cannot be written
     */
    final public void getFrameSource(WebElement frameWebelement, File captureDirectory) throws IOException {
        if (!captureDirectory.exists()) {
            captureDirectory.mkdir();
        }
        File sourceFile = new File(captureDirectory, "source.html");
        switchToFrame(frameWebelement);
        try {
            FileUtils.write(sourceFile, getSourceCode());
        } finally {
            // Always leave the frame, even if reading or writing failed.
            switchToMain();
        }
    }

    /**
     * Moves to the given element and clicks on it using Actions.
     * 
     * @param element
     *            the element to click on
     */
    public void click(WebElement element) {
        Action click = new Actions(driver).moveToElement(element).click(element).build();
        click.perform();
    }

    /**
     * Switches to the last window handle returned by the driver.
     */
    public void switchToLastWindow() {
        // NOTE(review): "last" is the last handle in iteration order of the
        // returned set; confirm that the driver returns an ordered set.
        String window = null;
        for (String handle : driver.getWindowHandles()) {
            window = handle;
        }
        driver.switchTo().window(window);
    }

    /**
     * Opens a new window through javascript and switches to it.
     */
    public void openNewWindow() throws IOException, SearchLibException {
        javascript("window.open()", false);
        switchToLastWindow();
    }

    /**
     * Closes the current window.
     */
    public void closeWindow() {
        driver.close();
    }

    /**
     * @return the URL currently loaded in the browser
     */
    public String getCurrentUrl() {
        return driver.getCurrentUrl();
    }

    /**
     * Converts the cookies of the browser to {@link CookieItem} instances
     * (name, value, domain, expiry date, path and secure flag are copied).
     * 
     * @return the cookie list, or null if the browser holds no cookie
     */
    public List<CookieItem> getCookies() {
        Set<Cookie> cookies = driver.manage().getCookies();
        if (CollectionUtils.isEmpty(cookies)) {
            return null;
        }
        List<CookieItem> cookieList = new ArrayList<CookieItem>(cookies.size());
        for (Cookie cookie : cookies) {
            BasicClientCookie basicCookie = new BasicClientCookie(cookie.getName(), cookie.getValue());
            basicCookie.setDomain(cookie.getDomain());
            basicCookie.setExpiryDate(cookie.getExpiry());
            basicCookie.setPath(cookie.getPath());
            basicCookie.setSecure(cookie.isSecure());
            cookieList.add(new CookieItem(basicCookie));
        }
        return cookieList;
    }

    /**
     * Walks up the ancestors of an element (using the ".." XPath) until one
     * with the given tag name is found.
     * 
     * @param tagName
     *            the tag name to look for (case insensitive); when null the
     *            direct parent is returned
     * @param element
     *            the starting element
     * @return the matching ancestor, or null if none was found
     */
    public WebElement getParent(String tagName, WebElement element) {
        try {
            WebElement current = element;
            while (true) {
                WebElement parent = current.findElement(By.xpath(".."));
                if (parent == null) {
                    return null;
                }
                if (tagName == null || tagName.equalsIgnoreCase(parent.getTagName())) {
                    return parent;
                }
                current = parent;
            }
        } catch (NoSuchElementException e) {
            Logging.warn(e);
            return null;
        }
    }
}