com.screenslicer.core.scrape.QueryForm.java Source code

Java tutorial

Introduction

Here is the source code for com.screenslicer.core.scrape.QueryForm.java

Source

/* 
 * ScreenSlicer (TM) -- automatic, zero-config web scraping (TM)
 * Copyright (C) 2013-2014 Machine Publishers, LLC
 * ops@machinepublishers.com | screenslicer.com | machinepublishers.com
 * 717 Martin Luther King Dr W Ste I, Cincinnati, Ohio 45220
 *
 * You can redistribute this program and/or modify it under the terms of the
 * GNU Affero General Public License version 3 as published by the Free
 * Software Foundation. Additional permissions or commercial licensing may be
 * available--see LICENSE file or contact Machine Publishers, LLC for details.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License version 3
 * for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * version 3 along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * For general details about how to investigate and report license violations,
 * please see: https://www.gnu.org/licenses/gpl-violation.html
 * and email the author: ops@machinepublishers.com
 * Keep in mind that paying customers have more rights than the AGPL alone offers.
 */
package com.screenslicer.core.scrape;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.nodes.Element;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.Select;

import com.screenslicer.api.datatype.HtmlNode;
import com.screenslicer.api.request.FormLoad;
import com.screenslicer.api.request.FormQuery;
import com.screenslicer.common.CommonUtil;
import com.screenslicer.common.Crypto;
import com.screenslicer.common.Log;
import com.screenslicer.core.scrape.Scrape.ActionFailed;
import com.screenslicer.core.util.Util;
import com.screenslicer.webapp.WebApp;

import edu.emory.mathcs.backport.java.util.Collections;

public class QueryForm {
    /** Ratio of named controls that must be exceeded before unnamed controls are dropped. */
    private static final double NAMED_CONTROLS_MIN_RATIO = .75;
    /** How many times each erasing key is repeated in {@link #delete}. */
    private static final int CHARS_TO_REMOVE = 60;
    /**
     * Key sequence made of {@code CHARS_TO_REMOVE} backspace keys followed by
     * {@code CHARS_TO_REMOVE} delete keys.
     * NOTE(review): presumably sent to a text field to wipe its current content -- the
     * consumer is not in this file, confirm against callers.
     */
    static String delete;
    static {
        String[] erasingKeys = { Keys.BACK_SPACE.toString(), Keys.DELETE.toString() };
        StringBuilder sequence = new StringBuilder();
        for (String erasingKey : erasingKeys) {
            for (int repeat = 0; repeat < CHARS_TO_REMOVE; repeat++) {
                sequence.append(erasingKey);
            }
        }
        delete = sequence.toString();
    }

    /**
     * Submits the form with the given element ID.
     * <p>
     * All {@code input} and {@code button} descendants whose {@code type} attribute is
     * "submit" are collected. If there is exactly one it is clicked; if there are several,
     * the one whose stripped outer HTML occurs earliest in the form's stripped outer HTML is
     * clicked. When nothing could be clicked, the form element's own {@code submit()} is
     * invoked instead. The method always ends with {@code Util.driverSleepLong()}.
     *
     * @param driver the browser session holding the form
     * @param formId the {@code id} attribute of the form element
     * @throws ActionFailed if locating or submitting the form fails
     */
    private static void doSubmit(RemoteWebDriver driver, String formId) throws ActionFailed {
        try {
            WebElement form = driver.findElementById(formId);
            List<WebElement> possibleSubmits = new ArrayList<WebElement>();
            possibleSubmits.addAll(form.findElements(By.tagName("input")));
            possibleSubmits.addAll(form.findElements(By.tagName("button")));
            List<WebElement> submits = new ArrayList<WebElement>();
            for (WebElement possibleSubmit : possibleSubmits) {
                if ("submit".equalsIgnoreCase(possibleSubmit.getAttribute("type"))) {
                    submits.add(possibleSubmit);
                }
            }
            boolean clicked = false;
            try {
                if (submits.size() == 1) {
                    clicked = Util.click(driver, submits.get(0));
                } else if (submits.size() > 1) {
                    String formHtml = CommonUtil.strip(form.getAttribute("outerHTML"), false);
                    int minIndex = Integer.MAX_VALUE;
                    WebElement firstSubmit = null;
                    // First candidate whose markup could not be found in the form markup;
                    // only used when no candidate could be located at all.
                    WebElement unlocatedSubmit = null;
                    for (WebElement submit : submits) {
                        try {
                            String submitHtml = CommonUtil.strip(submit.getAttribute("outerHTML"), false);
                            int submitIndex = formHtml.indexOf(submitHtml);
                            if (submitIndex < 0) {
                                // indexOf returns -1 for "not found"; it must not win the
                                // "earliest position" comparison against real positions.
                                if (unlocatedSubmit == null) {
                                    unlocatedSubmit = submit;
                                }
                            } else if (submitIndex < minIndex) {
                                minIndex = submitIndex;
                                firstSubmit = submit;
                            }
                        } catch (Throwable t) {
                            Log.exception(t);
                        }
                    }
                    if (firstSubmit == null) {
                        firstSubmit = unlocatedSubmit;
                    }
                    if (firstSubmit != null) {
                        clicked = Util.click(driver, firstSubmit);
                    }
                }
            } catch (Throwable t) {
                // Clicking is best-effort; fall through to the plain form submit below.
                Log.exception(t);
            }
            if (!clicked) {
                // Look the form up again rather than reusing the earlier reference, since a
                // failed click attempt may have happened in between.
                driver.findElementById(formId).submit();
            }
            Util.driverSleepLong();
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Opens {@code context.site}, performs the configured pre-auth clicks, authentication
     * and pre-search clicks, fills in the form controls named in {@code context.formModel},
     * submits the form identified by {@code context.formId} and finally performs the
     * post-search clicks.
     * <p>
     * Each key of {@code context.formModel} is matched against the {@code guid} of an entry
     * in {@code context.formSchema}. Entries with an empty value list are skipped. Supported
     * controls are {@code select}, {@code input} of type text/search and {@code input} of
     * type checkbox/radio. The fill pass is repeated while any control reported a change,
     * up to three passes in total.
     * <p>
     * A failure on a single control is logged and does not abort the remaining controls.
     *
     * @param driver the browser session to drive
     * @param context the site, credentials, click sequences, form schema and form model
     * @throws ActionFailed if any step outside the per-control handling fails
     */
    public static void perform(RemoteWebDriver driver, FormQuery context) throws ActionFailed {
        try {
            Util.get(driver, context.site, true);
            Util.doClicks(driver, context.preAuthClicks, null);
            QueryCommon.doAuth(driver, context.credentials);
            Util.doClicks(driver, context.preSearchClicks, null);
            // Index the schema by GUID; formModel keys are looked up in this map.
            Map<String, HtmlNode> formControls = new HashMap<String, HtmlNode>();
            for (int i = 0; i < context.formSchema.length; i++) {
                formControls.put(context.formSchema[i].guid, context.formSchema[i]);
            }
            Map<String, List<String>> formData = context.formModel;
            boolean valueChanged = false;
            int count = 0;
            final int MAX_TRIES = 3;
            Element body = Util.openElement(driver, null, null, null);
            do {
                ++count;
                valueChanged = false;
                for (Map.Entry<String, List<String>> entry : formData.entrySet()) {
                    try {
                        List<String> values = entry.getValue();
                        if (CommonUtil.isEmpty(values)) {
                            continue;
                        }
                        HtmlNode formControl = formControls.get(entry.getKey());
                        if (formControl == null) {
                            // Report the mismatch explicitly instead of relying on an NPE.
                            Log.exception(new IllegalArgumentException(
                                    "No form control in schema for key: " + entry.getKey()));
                            continue;
                        }
                        boolean isInput = "input".equalsIgnoreCase(formControl.tagName);
                        if ("select".equalsIgnoreCase(formControl.tagName)) {
                            if (WebApp.DEBUG) {
                                System.out.println("Query Form: select");
                            }
                            Select select = new Select(Util.toElement(driver, formControl, body));
                            if (select.isMultiple()) {
                                select.deselectAll();
                            }
                            // Collect the non-empty values of the options currently selected.
                            List<String> selectedStrings = new ArrayList<String>();
                            for (WebElement selectedElement : select.getAllSelectedOptions()) {
                                String selectedString = selectedElement.getAttribute("value");
                                if (!CommonUtil.isEmpty(selectedString)) {
                                    selectedStrings.add(selectedString);
                                }
                            }
                            boolean matches = true;
                            for (String selectedString : selectedStrings) {
                                if (!values.contains(selectedString)) {
                                    matches = false;
                                    break;
                                }
                            }
                            // Re-select only when the current selection differs from the model.
                            if (!matches || selectedStrings.size() != values.size()) {
                                for (String val : values) {
                                    valueChanged = true;
                                    select.selectByValue(val);
                                    Util.driverSleepVeryShort();
                                }
                            }
                        } else if (isInput && ("text".equalsIgnoreCase(formControl.type)
                                || "search".equalsIgnoreCase(formControl.type))) {
                            if (WebApp.DEBUG) {
                                System.out.println("Query Form: input[text|search]");
                            }
                            WebElement element = Util.toElement(driver, formControl, body);
                            // Accumulate with |= so that a change seen on an earlier control in
                            // this pass is not erased when this control reports no change.
                            // NOTE(review): assumes typeText returns whether the value changed.
                            valueChanged |= QueryCommon.typeText(driver, element, values.get(0), true,
                                    false);
                        } else if (isInput && ("checkbox".equalsIgnoreCase(formControl.type)
                                || "radio".equalsIgnoreCase(formControl.type))) {
                            if (WebApp.DEBUG) {
                                System.out.println("Query Form: input[checkbox|radio]");
                            }
                            WebElement element = Util.toElement(driver, formControl, body);
                            if (values != null && !values.isEmpty()) {
                                if ("radio".equalsIgnoreCase(formControl.type)) {
                                    String elementVal = element.getAttribute("value");
                                    String schemaVal = formControl.value;
                                    String modelVal = values.get(0);
                                    // schemaVal is known non-null here, so compare from its
                                    // side to tolerate a null model value.
                                    if (elementVal != null && schemaVal != null
                                            && elementVal.equalsIgnoreCase(schemaVal)
                                            && schemaVal.equalsIgnoreCase(modelVal)) {
                                        if (!element.isSelected()) {
                                            if (WebApp.DEBUG) {
                                                System.out.println("Clicking radio button");
                                            }
                                            valueChanged |= Util.click(driver, element);
                                        }
                                    }
                                } else if (!element.isSelected()) {
                                    if (WebApp.DEBUG) {
                                        System.out.println("Clicking [checkbox|radio]");
                                    }
                                    valueChanged |= Util.click(driver, element);
                                }
                            } else {
                                // NOTE(review): this branch looks unreachable, because entries
                                // with an empty value list are skipped at the top of the loop
                                // (assuming CommonUtil.isEmpty covers null and empty lists).
                                // Kept as-is; confirm whether deselection was meant to run.
                                if (element.isSelected()) {
                                    if (WebApp.DEBUG) {
                                        System.out.println("Deselecting [checkbox|radio]");
                                    }
                                    valueChanged = true;
                                    element.clear();
                                    Util.driverSleepVeryShort();
                                }
                            }
                        }
                    } catch (Throwable t) {
                        // Best effort per control: log and continue with the next entry.
                        Log.exception(t);
                    }
                }
            } while (valueChanged && count < MAX_TRIES);
            doSubmit(driver, context.formId);
            Util.doClicks(driver, context.postSearchClicks, null);
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Opens {@code context.site}, performs the configured pre-auth clicks, authentication
     * and pre-search clicks, and then describes the controls of the form identified by
     * {@code context.formId}.
     * <p>
     * {@code input}, {@code button}, {@code textarea} and {@code select} descendants of the
     * form are converted with {@code toFormControl}; elements for which it returns
     * {@code null} are left out. For selects, the inner HTML and type are cleared, the option
     * values and labels are recorded (only options where both are non-empty) and
     * {@code multiple} is set to "multiple" when the element indicates multi-selection.
     * Labels and GUIDs are then attached, the controls are sorted by the position of their
     * stripped outer HTML inside the form's stripped outer HTML, and the list is passed
     * through {@code filterControls}.
     *
     * @param driver the browser session to drive
     * @param context the site, credentials, click sequences and form ID
     * @return the form's controls in document order, after filtering
     * @throws ActionFailed if any step fails
     */
    public static List<HtmlNode> load(RemoteWebDriver driver, FormLoad context) throws ActionFailed {
        try {
            Util.get(driver, context.site, true);
            Util.doClicks(driver, context.preAuthClicks, null);
            QueryCommon.doAuth(driver, context.credentials);
            Util.doClicks(driver, context.preSearchClicks, null);
            WebElement form = driver.findElementById(context.formId);
            // Stripped outer HTML per control, used afterwards to order the controls.
            Map<HtmlNode, String> controlsHtml = new HashMap<HtmlNode, String>();
            String formHtml = CommonUtil.strip(form.getAttribute("outerHTML"), false);
            List<HtmlNode> controls = new ArrayList<HtmlNode>();

            List<WebElement> elements = new ArrayList<WebElement>();
            elements.addAll(form.findElements(By.tagName("input")));
            elements.addAll(form.findElements(By.tagName("button")));
            elements.addAll(form.findElements(By.tagName("textarea")));
            for (WebElement element : elements) {
                HtmlNode control = toFormControl(element);
                if (control != null) {
                    controls.add(control);
                    controlsHtml.put(control, CommonUtil.strip(element.getAttribute("outerHTML"), false));
                }
            }

            for (WebElement element : form.findElements(By.tagName("select"))) {
                HtmlNode control = toFormControl(element);
                if (control == null) {
                    continue;
                }
                control.innerHtml = null;
                List<String> optionValues = new ArrayList<String>();
                List<String> optionLabels = new ArrayList<String>();
                for (WebElement option : element.findElements(By.tagName("option"))) {
                    String value = option.getAttribute("value");
                    String label = option.getAttribute("innerHTML");
                    // Keep values and labels aligned: an option is recorded only when both exist.
                    if (!CommonUtil.isEmpty(value) && !CommonUtil.isEmpty(label)) {
                        optionValues.add(value);
                        optionLabels.add(label);
                    }
                }
                // Multi-selection may be signalled by "multiple", by "data-multiple" or by
                // the element's type being "select-multiple".
                String multiple = element.getAttribute("multiple");
                if (CommonUtil.isEmpty(multiple) || "false".equalsIgnoreCase(multiple)) {
                    multiple = element.getAttribute("data-multiple");
                }
                if (!CommonUtil.isEmpty(multiple) && !"false".equalsIgnoreCase(multiple)) {
                    control.multiple = "multiple";
                }
                if ("select-multiple".equalsIgnoreCase(control.type)) {
                    control.multiple = "multiple";
                }
                control.type = null;
                control.optionValues = optionValues.toArray(new String[0]);
                control.optionLabels = optionLabels.toArray(new String[0]);
                controls.add(control);
                controlsHtml.put(control, CommonUtil.strip(element.getAttribute("outerHTML"), false));
            }

            loadLabels(driver, controls);
            loadGuids(controls);
            // Fully qualified on purpose: the file imports a third-party "Collections" whose
            // sort is raw-typed; the JDK version is generic and equally stable.
            java.util.Collections.sort(controls, new ControlComparator(formHtml, controlsHtml));
            return filterControls(controls);
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Sets {@code guidName} on every control that has a non-empty {@code name}, using
     * {@code Crypto.fastHash} of that name. Controls without a name are left untouched.
     *
     * @param formControls the controls to update in place
     */
    private static void loadGuids(List<HtmlNode> formControls) {
        for (HtmlNode control : formControls) {
            if (CommonUtil.isEmpty(control.name)) {
                continue;
            }
            control.guidName = Crypto.fastHash(control.name);
        }
    }

    /**
     * Orders form controls by where their HTML first occurs inside the form's HTML.
     * A control whose HTML is unknown or does not occur in the form HTML gets position -1
     * and therefore sorts before all located controls.
     */
    private static class ControlComparator implements Comparator<HtmlNode> {
        private final String formHtml;
        private final Map<HtmlNode, String> controlsHtml;

        /**
         * @param formHtml the HTML of the whole form
         * @param controlsHtml the HTML of each control, keyed by the control
         */
        public ControlComparator(String formHtml, Map<HtmlNode, String> controlsHtml) {
            this.formHtml = formHtml;
            this.controlsHtml = controlsHtml;
        }

        @Override
        public int compare(HtmlNode lhs, HtmlNode rhs) {
            return Integer.compare(positionOf(lhs), positionOf(rhs));
        }

        /** Returns the index of the control's HTML within the form HTML, or -1 if unavailable. */
        private int positionOf(HtmlNode control) {
            String html = controlsHtml.get(control);
            // A control missing from the map is treated like "not found" instead of
            // letting indexOf(null) throw in the middle of a sort.
            return html == null ? -1 : formHtml.indexOf(html);
        }
    }

    /**
     * Drops unnamed controls when the form is predominantly made of named controls.
     * <p>
     * Controls for which {@code isControlNameNeeded} is false are always kept. Of the
     * remaining ones only those with a non-empty {@code name} are kept. The reduced list is
     * returned only if the share of named controls among the name-requiring ones exceeds
     * {@code NAMED_CONTROLS_MIN_RATIO}; otherwise the original list is returned unchanged.
     * When no control requires a name the ratio is 0/0 (NaN), the comparison is false and
     * the original list is returned.
     *
     * @param formControls all controls found in the form
     * @return either the reduced list or {@code formControls} itself
     */
    private static List<HtmlNode> filterControls(List<HtmlNode> formControls) {
        List<HtmlNode> kept = new ArrayList<HtmlNode>();
        int needingName = 0;
        int named = 0;
        for (HtmlNode candidate : formControls) {
            if (!isControlNameNeeded(candidate)) {
                kept.add(candidate);
                continue;
            }
            ++needingName;
            if (!CommonUtil.isEmpty(candidate.name)) {
                ++named;
                kept.add(candidate);
            }
        }
        double namedRatio = (double) named / (double) needingName;
        return namedRatio > NAMED_CONTROLS_MIN_RATIO ? kept : formControls;
    }

    /**
     * Tells whether a control is of a kind that is expected to carry a {@code name}.
     * <p>
     * {@code button} elements and {@code input} elements of type "button" or "submit" do
     * not need a name; every other control does.
     *
     * @param control the control to inspect
     * @return {@code false} for buttons and button/submit inputs, {@code true} otherwise
     */
    private static boolean isControlNameNeeded(HtmlNode control) {
        if ("button".equalsIgnoreCase(control.tagName)) {
            return false;
        }
        if (!"input".equalsIgnoreCase(control.tagName) || CommonUtil.isEmpty(control.type)) {
            return true;
        }
        // Both type checks must hold: joining them with "or" is always true, because a
        // type can never equal "button" and "submit" at the same time.
        return !"button".equalsIgnoreCase(control.type) && !"submit".equalsIgnoreCase(control.type);
    }

    /**
     * Assigns label text to controls by matching each control's {@code id} against the
     * {@code for} attribute of the page's {@code label} elements.
     * <p>
     * Labels with an empty {@code for} attribute or empty text are ignored; both values are
     * trimmed before use. Every control with a non-empty {@code id} has its {@code label}
     * set to the matching text, or to {@code null} when no label refers to it.
     *
     * @param driver the browser session used to find the label elements
     * @param controls the controls to update in place
     * @throws ActionFailed if reading the labels fails
     */
    private static void loadLabels(RemoteWebDriver driver, List<HtmlNode> controls) throws ActionFailed {
        try {
            Map<String, String> textByTargetId = new HashMap<String, String>();
            for (WebElement label : driver.findElementsByTagName("label")) {
                String targetId = label.getAttribute("for");
                String text = label.getText();
                if (CommonUtil.isEmpty(targetId) || CommonUtil.isEmpty(text)) {
                    continue;
                }
                textByTargetId.put(targetId.trim(), text.trim());
            }
            for (HtmlNode control : controls) {
                if (CommonUtil.isEmpty(control.id)) {
                    continue;
                }
                control.label = textByTargetId.get(control.id);
            }
        } catch (Throwable t) {
            Log.exception(t);
            throw new ActionFailed(t);
        }
    }

    /**
     * Converts a page element into an {@code HtmlNode} describing a form control.
     * <p>
     * The tag name and the {@code name}, {@code title}, {@code id}, {@code type},
     * {@code value} and {@code innerHTML} attributes are copied; empty attribute values are
     * stored as {@code null}.
     *
     * @param element the element to describe
     * @return the control description, or {@code null} if the element is not displayed or
     *         its type is "hidden"
     */
    private static HtmlNode toFormControl(WebElement element) {
        if (!element.isDisplayed()) {
            return null;
        }
        // Decide on hidden controls first so the remaining attribute reads are not wasted.
        String type = attributeOrNull(element, "type");
        if ("hidden".equalsIgnoreCase(type)) {
            return null;
        }
        HtmlNode control = new HtmlNode();
        control.tagName = element.getTagName();
        control.name = attributeOrNull(element, "name");
        control.title = attributeOrNull(element, "title");
        control.id = attributeOrNull(element, "id");
        control.type = type;
        control.value = attributeOrNull(element, "value");
        control.innerHtml = attributeOrNull(element, "innerHTML");
        return control;
    }

    /** Reads an attribute from the element, mapping an empty value to {@code null}. */
    private static String attributeOrNull(WebElement element, String attributeName) {
        String attr = element.getAttribute(attributeName);
        return CommonUtil.isEmpty(attr) ? null : attr;
    }
}