com.wsl.marketconsolescraper.logic.Scraper.java Source code

Java tutorial

Introduction

Here is the source code for com.wsl.marketconsolescraper.logic.Scraper.java

Source

/*
 * Copyright (C) 2010 WorkSmart Labs, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.wsl.marketconsolescraper.logic;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlPasswordInput;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.wsl.marketconsolescraper.ScraperConfiguration;
import com.wsl.marketconsolescraper.model.AndroidMarketListing;
import com.wsl.marketconsolescraper.model.ElementStarsToString;
import com.wsl.marketconsolescraper.model.ListingDataItemRetriever;

/**
 * Scrapes Google Market's developer console for info about applications.
 *
 * @author lance
 */
public class Scraper {
    /** How long to wait for any scheduled JavaScript. */
    private static final int WAIT_FOR_DELAYED_JAVASCRIPT = 10 * 1000;

    /** Where the submit button is located in the sign-in page. */
    private static final String SIGN_IN_PAGE_SUBMIT_INPUT_XPATH = "//input[@type=\"submit\"]";

    /** Where the password input is located in the sign-in page. */
    private static final String SIGN_IN_PAGE_PASSWORD_INPUT_XPATH = "//input[@name=\"Passwd\"]";

    /** Where the email input is located in the sign-in page. */
    private static final String SIGN_IN_PAGE_EMAIL_INPUT_XPATH = "//input[@name=\"Email\"]";

    /** Page to sign-in to in order to get to the Market Console. */
    private static final String SIGN_IN_PAGE = "https://www.google.com/accounts/ServiceLogin?service=androiddeveloper&passive=true&nui=1"
            + "&continue=http%3A%2F%2Fmarket.android.com%2Fpublish"
            + "&followup=http%3A%2F%2Fmarket.android.com%2Fpublish";

    /** Text to check for when sign-in fails. */
    private static final String SIGN_IN_PAGE_FAILED_TEXT = "The username or password you entered is incorrect.";

    /** Error message for when a required page could not be loaded. */
    private static final String COULD_NOT_LOAD_REQUIRED_PAGE_MESSAGE = "Could not load page: ";

    /** What information is extracted from each application, where it is, and how to process it. */
    private static final List<ListingDataItemRetriever> mappings = Arrays.asList(new ListingDataItemRetriever[] {
            new ListingDataItemRetriever("//div[@class='listingRow']//a[contains(@href, '#EDIT_APPLICATION?')]") {
                @Override
                public void setProperty(HtmlElement rawItem, AndroidMarketListing listing) {
                    if (rawItem == null) {
                        return;
                    }
                    listing.setApplication(rawItem.asText().trim());
                }
            }, new ListingDataItemRetriever("//div[@class='listingRow']//span[@example='1.0']") {
                @Override
                public void setProperty(HtmlElement rawItem, AndroidMarketListing listing) {
                    if (rawItem == null) {
                        return;
                    }
                    listing.setVersion(rawItem.asText().trim());
                }
            }, new ListingDataItemRetriever("//table[@class='listingRating']//span[@class='gwt-InlineLabel']") {
                @Override
                public void setProperty(HtmlElement rawItem, AndroidMarketListing listing) {
                    if (rawItem == null) {
                        return;
                    }
                    String text = rawItem.asText().replace('(', ' ').replace(')', ' ').trim();
                    listing.setNumberOfRatings(new Integer(text));
                }
            }, new ListingDataItemRetriever("//div[@class='listingRow']//span[@example='12,000']") {
                @Override
                public void setProperty(HtmlElement rawItem, AndroidMarketListing listing) {
                    if (rawItem == null) {
                        return;
                    }
                    String text = rawItem.asText().trim();
                    listing.setTotalInstalls(new Integer(text));
                }
            }, new ListingDataItemRetriever("//div[@class='listingRow']//span[@example='9,000']") {
                @Override
                public void setProperty(HtmlElement rawItem, AndroidMarketListing listing) {
                    if (rawItem == null) {
                        return;
                    }
                    String text = rawItem.asText().trim();
                    listing.setActiveInstalls(new Integer(text));
                }
            }, new ElementStarsToString("//table[@class='listingRating']") });

    /** Used for logging. */
    private static final Log LOG = LogFactory.getLog(Scraper.class);

    /**
     * Performs a scrape.
     * @return scraped application data
     */
    public Collection<AndroidMarketListing> scrape() {
        log("Scraper starting.");

        final HtmlPage signedInPage = getMarketPage();

        log("Market page title: " + signedInPage.getTitleText());

        String pageAsText = signedInPage.asText();
        log("Market page text: " + pageAsText);

        LinkedList<AndroidMarketListing> applicationInfo = new LinkedList<AndroidMarketListing>();

        for (ListingDataItemRetriever item : mappings) {
            ListIterator<AndroidMarketListing> appInfoIterator = applicationInfo.listIterator();
            item.retrieveAppInfoItems(signedInPage, appInfoIterator);
        }
        log("Application info map: " + applicationInfo);

        return applicationInfo;
    }

    /** Logs a message.
     * @param message String to log
     */
    private static void log(String message) {
        LOG.debug(message);
    }

    /**
     * Gets a page that is required to continue scraping.
     * @param webClient WebClient to get it with
     * @param url String location of the page
     * @return specified page
     * @throws RuntimeException if the page couldn't be retrieved
     */
    protected static HtmlPage getRequiredPage(WebClient webClient, String url) {
        HtmlPage signedInPage;
        try {
            signedInPage = webClient.getPage(url);
        } catch (FailingHttpStatusCodeException e) {
            throw new RuntimeException(COULD_NOT_LOAD_REQUIRED_PAGE_MESSAGE + url, e);
        } catch (MalformedURLException e) {
            throw new RuntimeException(COULD_NOT_LOAD_REQUIRED_PAGE_MESSAGE + url, e);
        } catch (IOException e) {
            throw new RuntimeException(COULD_NOT_LOAD_REQUIRED_PAGE_MESSAGE + url, e);
        }
        return signedInPage;
    }

    /**
     * Gets the current Market page from the web.
     * @return market page
     */
    protected HtmlPage getMarketPage() {
        // Get sign-in page.
        final WebClient webClient = new WebClient();
        try {
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.setThrowExceptionOnScriptError(true);

            HtmlPage signInPage = getRequiredPage(webClient, SIGN_IN_PAGE);
            webClient.waitForBackgroundJavaScriptStartingBefore(WAIT_FOR_DELAYED_JAVASCRIPT);
            log("Retrieved sign-in page, title: " + signInPage.getTitleText());

            ScraperConfiguration config = new ScraperConfiguration();

            // Fill in login form.
            HtmlTextInput emailInput = (HtmlTextInput) signInPage.getFirstByXPath(SIGN_IN_PAGE_EMAIL_INPUT_XPATH);

            String marketUsername = config.getMarketUsername();
            System.out.println("Scraper using username: " + marketUsername);
            emailInput.setValueAttribute(marketUsername);
            HtmlPasswordInput passwordInput = (HtmlPasswordInput) signInPage
                    .getFirstByXPath(SIGN_IN_PAGE_PASSWORD_INPUT_XPATH);

            String marketPassword = config.getMarketPassword();
            passwordInput.setValueAttribute(marketPassword);

            // Submit login form.
            HtmlSubmitInput signInInput = (HtmlSubmitInput) signInPage
                    .getFirstByXPath(SIGN_IN_PAGE_SUBMIT_INPUT_XPATH);
            HtmlPage signedInPage;
            try {
                signedInPage = signInInput.click();
            } catch (IOException e) {
                throw new RuntimeException("Error submitting sign-in page.", e);
            }
            webClient.waitForBackgroundJavaScriptStartingBefore(WAIT_FOR_DELAYED_JAVASCRIPT);

            log("Page title after sign-in attempt: " + signedInPage.getTitleText());
            String pageAsText = signedInPage.asText();
            System.out.println("Signed in page text: " + pageAsText);

            boolean signInFailed = pageAsText.contains(SIGN_IN_PAGE_FAILED_TEXT);
            if (signInFailed) {
                throw new RuntimeException("sign-in failed. " + SIGN_IN_PAGE_FAILED_TEXT);
            }
            return signedInPage;
        } finally {
            webClient.closeAllWindows();
        }
    }
}