com.screenslicer.core.scrape.QueryKeyword.java Source code

Java tutorial

Introduction

Here is the source code for com.screenslicer.core.scrape.QueryKeyword.java

Source

/* 
 * ScreenSlicer (TM) -- automatic, zero-config web scraping (TM)
 * Copyright (C) 2013-2014 Machine Publishers, LLC
 * ops@machinepublishers.com | screenslicer.com | machinepublishers.com
 * 717 Martin Luther King Dr W Ste I, Cincinnati, Ohio 45220
 *
 * You can redistribute this program and/or modify it under the terms of the
 * GNU Affero General Public License version 3 as published by the Free
 * Software Foundation. Additional permissions or commercial licensing may be
 * available--see LICENSE file or contact Machine Publishers, LLC for details.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License version 3
 * for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * version 3 along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * For general details about how to investigate and report license violations,
 * please see: https://www.gnu.org/licenses/gpl-violation.html
 * and email the author: ops@machinepublishers.com
 * Keep in mind that paying customers have more rights than the AGPL alone offers.
 */
package com.screenslicer.core.scrape;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.openqa.selenium.Dimension;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.interactions.Actions;
import org.openqa.selenium.remote.RemoteWebDriver;

import com.screenslicer.api.request.KeywordQuery;
import com.screenslicer.common.CommonUtil;
import com.screenslicer.common.Log;
import com.screenslicer.core.scrape.Scrape.ActionFailed;
import com.screenslicer.core.util.Util;

/**
 * Performs a keyword search on a web site through a Selenium {@link RemoteWebDriver}.
 *
 * <p>The entry point is {@link #perform(RemoteWebDriver, KeywordQuery)}. It loads the site,
 * runs the configured clicks and authentication, then tries a series of increasingly
 * permissive strategies to locate a search input, type the keywords into it and submit.
 */
public class QueryKeyword {
    /** Minimum area (width * height as reported by {@code WebElement.getSize()}) for an iframe to be inspected. */
    private static final int MIN_IFRAME_AREA = 40000;
    /**
     * Minimum change in page-source length for a submitted search to count as having
     * changed the page; also reused as the minimum body-text length of a usable iframe page.
     */
    private static final int MIN_SOURCE_DIFF = 500;
    /** Distance the mouse is moved up and left, away from a clicked element, before hovering back over it. */
    private static final int MOUSE_MOVE_OFFSET = 500;
    /** Number of backspace keystrokes, and again of delete keystrokes, sent to empty a pre-filled input. */
    private static final int CHARS_TO_REMOVE = 60;

    /*
     * NOTE(review): the copy of this file that was edited had lost several non-Latin keyword
     * alternatives to character-encoding damage. What remained was a bare '?' quantifier (which
     * makes Pattern.compile throw PatternSyntaxException, so the class could not even be
     * initialized) and empty alternatives (which match every string). Those damaged fragments
     * are removed from the two patterns below.
     * TODO restore the lost translated keywords from an intact copy, ideally as Unicode escapes.
     */

    /** Matches text/attributes of a control (link, button, ...) that looks like it opens or triggers a search. */
    private static final Pattern searchControl = Pattern.compile(
            "(?:[-._\"]|^|\\b|\\s|/|#)(?:search|magnifying|magnify|Recherche|Chercher|Suche|Cerca|Buscar)(?:[-._\"]|$|\\b|\\s|/|#)",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
    /** Matches attributes of an input that looks like a search box. */
    private static final Pattern search = Pattern.compile(
            "search|find|query|keyword|locate|google|results|(?:(?:[-._\"]|^|\\b|\\s|/|#)q(?:[-._\"]|$|\\b|\\s|/|#))|Recherche|Chercher|Suche|Cerca|Buscar",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
    /** Matches attributes of an input that looks like a login or location field rather than a search box. */
    private static final Pattern nonSearch = Pattern.compile(
            "username|e-?mail|user|log\\s?in|uname|city|state|zip|location", //TODO translate
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);

    /** Tag names tried, in order, when looking for a control that reveals a search box. */
    private static final String[] FALLBACK_NAMES = { "button", "input", "input", "div", "label", "span", "li",
            "ul", "a" };
    /** Required {@code type} attribute for the tag at the same index of {@link #FALLBACK_NAMES}; null means any. */
    private static final String[] FALLBACK_TYPES = { null, "button", "submit", null, null, null, null, null,
            null };

    /** {@link #CHARS_TO_REMOVE} backspace keys followed by {@link #CHARS_TO_REMOVE} delete keys. */
    private static final String DELETE_KEYS = buildDeleteKeys();

    /** Builds the keystroke sequence used to wipe an input that {@code clear()} did not empty. */
    private static String buildDeleteKeys() {
        StringBuilder keys = new StringBuilder(CHARS_TO_REMOVE * 2);
        String backspace = Keys.BACK_SPACE.toString();
        for (int i = 0; i < CHARS_TO_REMOVE; i++) {
            keys.append(backspace);
        }
        String forwardDelete = Keys.DELETE.toString();
        for (int i = 0; i < CHARS_TO_REMOVE; i++) {
            keys.append(forwardDelete);
        }
        return keys.toString();
    }

    /**
     * Collects the text/search inputs of the current page, most promising first.
     *
     * <p>Inputs whose attributes match {@link #search} are moved to the front (keeping their
     * relative order). In strict mode, inputs whose attributes match {@link #nonSearch} are
     * dropped. In non-strict mode, if nothing qualified and the page has exactly one
     * {@code input} element of any type, that element is returned.
     *
     * @param driver the browser to query
     * @param strict whether to exclude inputs that look like login/location fields
     * @return candidate search boxes, possibly empty
     * @throws ActionFailed if querying the page fails
     */
    private static List<WebElement> findSearchBox(RemoteWebDriver driver, boolean strict) throws ActionFailed {
        try {
            List<WebElement> allInputs = driver.findElementsByTagName("input");
            List<WebElement> searchBoxes = new ArrayList<WebElement>();
            for (WebElement input : allInputs) {
                // Constant-first comparison: a missing type attribute must not abort the whole lookup.
                String type = input.getAttribute("type");
                if ("text".equalsIgnoreCase(type) || "search".equalsIgnoreCase(type)) {
                    searchBoxes.add(input);
                }
            }

            List<WebElement> prioritySearchBoxes = new ArrayList<WebElement>();
            List<WebElement> forbiddenSearchBoxes = new ArrayList<WebElement>();
            for (WebElement searchBox : searchBoxes) {
                String info = searchBox.getAttribute("name") + " " + searchBox.getAttribute("title")
                        + " " + searchBox.getAttribute("value") + " " + searchBox.getAttribute("class") + " "
                        + searchBox.getAttribute("placeholder") + " " + searchBox.getAttribute("id");
                if (strict && nonSearch.matcher(info).find()) {
                    forbiddenSearchBoxes.add(searchBox);
                } else if (search.matcher(info).find()) {
                    prioritySearchBoxes.add(searchBox);
                }
            }
            searchBoxes.removeAll(forbiddenSearchBoxes);
            // Move the likely search boxes to the front, preserving their page order.
            searchBoxes.removeAll(prioritySearchBoxes);
            searchBoxes.addAll(0, prioritySearchBoxes);

            if (!strict && searchBoxes.isEmpty() && allInputs.size() == 1) {
                searchBoxes.add(allInputs.get(0));
            }
            return searchBoxes;
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Clicks an element, then moves the mouse away and back over it, and finally cleans up
     * any windows other than the one that was current before the click.
     *
     * @param driver the browser to act on
     * @param element the element to click and hover
     * @throws ActionFailed if any of the browser interactions fails
     */
    private static void hoverClick(RemoteWebDriver driver, WebElement element) throws ActionFailed {
        try {
            String oldHandle = driver.getWindowHandle();
            Actions action = new Actions(driver);
            Util.click(driver, element);
            action.moveByOffset(-MOUSE_MOVE_OFFSET, -MOUSE_MOVE_OFFSET).perform();
            action.moveToElement(element).perform();
            action.moveByOffset(2, 2).perform();
            Util.driverSleepShort();
            Util.cleanUpNewWindows(driver, oldHandle);
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Hover-clicks the first of the given elements that looks like a search control.
     *
     * <p>Failures on an individual element are logged and the next element is tried.
     *
     * @param driver the browser to act on
     * @param tags candidate elements, in page order
     * @param type required {@code type} attribute (case-insensitive), or null to accept any
     * @param extended if false, match against {@code href} and visible text; if true, match
     *        against the {@code name}, {@code title}, {@code class} and {@code id} attributes
     * @return true if an element matched and was clicked without error
     */
    private static boolean clickSearchControl(RemoteWebDriver driver, List<WebElement> tags, String type,
            boolean extended) {
        for (WebElement tag : tags) {
            try {
                if (type != null && !type.equalsIgnoreCase(tag.getAttribute("type"))) {
                    continue;
                }
                String info;
                if (extended) {
                    info = tag.getAttribute("name") + " " + tag.getAttribute("title") + " "
                            + tag.getAttribute("class") + " " + tag.getAttribute("id");
                } else {
                    info = tag.getAttribute("href") + " " + tag.getText();
                }
                if (searchControl.matcher(info).find()) {
                    hoverClick(driver, tag);
                    return true;
                }
            } catch (Exception e) {
                Log.exception(e);
            }
        }
        return false;
    }

    /**
     * Tries to reveal a search box by clicking a search-like control with the given tag name,
     * then looks for search boxes again.
     *
     * <p>Elements are first matched by {@code href}/text; only if none of them could be clicked
     * are they matched by {@code name}/{@code title}/{@code class}/{@code id}.
     *
     * @param driver the browser to act on
     * @param tagName tag name of the controls to consider
     * @param type required {@code type} attribute, or null to accept any
     * @param strict passed through to {@link #findSearchBox(RemoteWebDriver, boolean)}
     * @return search boxes found after a successful click, or an empty list if nothing was clicked
     * @throws ActionFailed if querying the page fails
     */
    private static List<WebElement> navigateToSearch(RemoteWebDriver driver, String tagName, String type,
            boolean strict) throws ActionFailed {
        try {
            List<WebElement> tags = driver.findElementsByTagName(tagName);
            boolean success = clickSearchControl(driver, tags, type, false)
                    || clickSearchControl(driver, tags, type, true);
            if (success) {
                return findSearchBox(driver, strict);
            }
            return new ArrayList<WebElement>();
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Types the query into each candidate box in turn and submits it with a newline until
     * one submission visibly changes the page (title, URL, or page-source length by more
     * than {@link #MIN_SOURCE_DIFF}).
     *
     * <p>Failures on an individual box are logged and the next box is tried.
     *
     * @param driver the browser to act on
     * @param searchBoxes candidate inputs, most promising first
     * @param searchQuery the text to search for
     * @return the page source after a successful search, or null if no box produced a change
     */
    private static String doSearch(RemoteWebDriver driver, List<WebElement> searchBoxes, String searchQuery) {
        try {
            for (WebElement element : searchBoxes) {
                try {
                    Util.click(driver, element);
                    element.clear();
                    Util.driverSleepVeryShort();
                    // If clear() left text behind, wipe it with explicit keystrokes.
                    if (!CommonUtil.isEmpty(element.getAttribute("value"))) {
                        element.sendKeys(DELETE_KEYS);
                        Util.driverSleepVeryShort();
                    }
                    element.sendKeys(searchQuery);
                    Util.driverSleepVeryShort();
                    String beforeSource = driver.getPageSource();
                    String beforeTitle = driver.getTitle();
                    String beforeUrl = driver.getCurrentUrl();
                    String windowHandle = driver.getWindowHandle();
                    element.sendKeys("\n");
                    Util.driverSleepLong();
                    Util.cleanUpNewWindows(driver, windowHandle);
                    String afterSource = driver.getPageSource();
                    String afterTitle = driver.getTitle();
                    String afterUrl = driver.getCurrentUrl();
                    if (!beforeTitle.equals(afterTitle) || !beforeUrl.equals(afterUrl)
                            || Math.abs(beforeSource.length() - afterSource.length()) > MIN_SOURCE_DIFF) {
                        handleIframe(driver);
                        return driver.getPageSource();
                    }
                } catch (Throwable t) {
                    Log.exception(t);
                }
            }
        } catch (Throwable t) {
            Log.exception(t);
        }
        return null;
    }

    /**
     * Looks for a large, displayed iframe whose {@code src} is an absolute URL with a query
     * string and, if its target has enough body text, leaves the browser on that target.
     *
     * <p>For each candidate iframe the {@code src} is loaded after calling
     * {@code Util.newWindow}. If loading fails or the body text is shorter than
     * {@link #MIN_SOURCE_DIFF}, the browser is restored to where it was and the next iframe
     * is tried; otherwise processing stops at that iframe. Any failure while handling one
     * iframe is logged and the next iframe is tried.
     *
     * @param driver the browser to act on
     * @throws ActionFailed if the iframes of the page cannot be listed
     */
    private static void handleIframe(RemoteWebDriver driver) throws ActionFailed {
        List<WebElement> iframes = null;
        try {
            iframes = driver.findElementsByTagName("iframe");
        } catch (Throwable t) {
            throw new ActionFailed(t);
        }
        try {
            for (WebElement iframe : iframes) {
                try {
                    Dimension dim = iframe.getSize();
                    if (iframe.isDisplayed() && (dim.getHeight() * dim.getWidth()) > MIN_IFRAME_AREA) {
                        String src = iframe.getAttribute("src");
                        if (!CommonUtil.isEmpty(src) && src.indexOf("://") > -1 && src.indexOf("?") > -1) {
                            String origHandle = null;
                            String origUrl = null;
                            String newHandle = null;
                            try {
                                origHandle = driver.getWindowHandle();
                                origUrl = driver.getCurrentUrl();
                                newHandle = Util.newWindow(driver);
                            } catch (Throwable t) {
                                Log.exception(t);
                                // Caught again by the per-iframe handler below, which moves on.
                                throw new ActionFailed(t);
                            }
                            boolean undo = false;
                            try {
                                Util.get(driver, src, true);
                                driver.executeScript(
                                        "document.getElementsByTagName('html')[0].style.overflow='scroll';");
                                Util.driverSleepShort();
                                if (driver.findElementByTagName("body").getText().length() < MIN_SOURCE_DIFF) {
                                    undo = true;
                                }
                            } catch (Throwable t) {
                                Log.exception(t);
                                undo = true;
                            }
                            try {
                                if (undo) {
                                    if (origHandle.equals(newHandle)) {
                                        // Same window was reused: go back, and reload if that was not enough.
                                        if (!driver.getCurrentUrl().equals(origUrl)) {
                                            try {
                                                driver.navigate().back();
                                            } catch (Throwable t) {
                                                Log.exception(t);
                                            }
                                        }
                                        if (!driver.getCurrentUrl().equals(origUrl)) {
                                            driver.get(origUrl);
                                        }
                                    } else {
                                        Util.cleanUpNewWindows(driver, origHandle);
                                    }
                                } else {
                                    // The iframe target is usable: keep it and stop looking.
                                    Util.cleanUpNewWindows(driver, newHandle);
                                    break;
                                }
                            } catch (Throwable t) {
                                Log.exception(t);
                                // Caught again by the per-iframe handler below, which moves on.
                                throw new ActionFailed(t);
                            }
                        }
                    }
                } catch (Throwable t) {
                    Log.exception(t);
                    continue;
                }
            }
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Walks the fallback tag names, clicking a search-like control of each kind and searching
     * in whatever search boxes are found afterwards, until one search succeeds.
     *
     * @param driver the browser to act on
     * @param keywords the text to search for
     * @param strict passed through to {@link #navigateToSearch}
     * @return the page source after a successful search, or null if every fallback failed
     * @throws ActionFailed if querying the page fails
     */
    private static String searchViaFallbacks(RemoteWebDriver driver, String keywords, boolean strict)
            throws ActionFailed {
        String searchResult = null;
        for (int i = 0; i < FALLBACK_NAMES.length && searchResult == null; i++) {
            List<WebElement> searchBoxes = navigateToSearch(driver, FALLBACK_NAMES[i], FALLBACK_TYPES[i], strict);
            searchResult = doSearch(driver, searchBoxes, keywords);
        }
        return searchResult;
    }

    /**
     * Loads the site described by the query, authenticates, and submits the keywords.
     *
     * <p>Search strategies are tried in this order, stopping at the first that changes the page:
     * strict search-box detection, strict detection after clicking a search-like control,
     * non-strict detection, and non-strict detection after clicking a search-like control.
     * The post-search clicks run only after a successful search.
     *
     * @param driver the browser to act on
     * @param context the site, credentials, click sequences and keywords to use
     * @throws ActionFailed if any step fails or no strategy manages to submit the search
     */
    public static void perform(RemoteWebDriver driver, KeywordQuery context) throws ActionFailed {
        try {
            Util.get(driver, context.site, true);
            Util.doClicks(driver, context.preAuthClicks, null);
            QueryCommon.doAuth(driver, context.credentials);
            Util.doClicks(driver, context.preSearchClicks, null);

            String searchResult = doSearch(driver, findSearchBox(driver, true), context.keywords);
            if (searchResult == null) {
                searchResult = searchViaFallbacks(driver, context.keywords, true);
            }
            if (searchResult == null) {
                searchResult = doSearch(driver, findSearchBox(driver, false), context.keywords);
            }
            if (searchResult == null) {
                searchResult = searchViaFallbacks(driver, context.keywords, false);
            }
            if (searchResult == null) {
                throw new ActionFailed();
            }
            Util.doClicks(driver, context.postSearchClicks, null);
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }
}