com.evidon.areweprivateyet.Crawler.java Source code

Java tutorial

Introduction

Here is the source code for com.evidon.areweprivateyet.Crawler.java

Source

// Copyright 2013 Evidon.  All rights reserved.
// Use of this source code is governed by an Apache License 2.0
// license that can be found in the LICENSE file.

package com.evidon.areweprivateyet;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.firefox.internal.ProfilesIni;

/**
 * Crawls the site list with Firefox running under one named profile.
 *
 * <p>Constructing a {@code Crawler} performs the whole crawl: it loads the site list from
 * {@code top500.list}, starts Firefox with the given profile, visits every site, copies
 * {@code fourthparty.sqlite} out of the temporary profile directory and finally writes the
 * crawl log to {@code crawl-<profile>.log}.
 */
public class Crawler {
    // Base directory for input and output files, taken from the awby_path VM property,
    // e.g. -Dawby_path=C:/Users/fixanoid-work/Desktop/areweprivateyet/bin/
    String path = System.getProperty("awby_path");
    // Sites to visit; each entry is navigated to as "http://" + entry.
    ArrayList<String> urls = new ArrayList<String>();
    // Accumulated crawl log; written to disk by recordLog().
    StringBuilder out = new StringBuilder();

    /**
     * Resolves a file name against the base directory.
     *
     * <p>Works whether or not {@code awby_path} ends with a separator. If the property is unset
     * or empty the name is resolved against the current working directory.
     *
     * @param name file name relative to the base directory
     * @return the resolved file
     */
    private File inBaseDir(String name) {
        if (path == null || path.length() == 0) {
            return new File(name);
        }
        return new File(path, name);
    }

    /**
     * Writes the accumulated crawl log to {@code crawl-<name>.log} in the base directory.
     *
     * @param name profile name used in the log file name
     * @throws IOException if the log file cannot be written
     */
    private void recordLog(String name) throws IOException {
        // The writer must not be called "out": that would shadow the log buffer field and
        // the file would receive the writer's own toString() instead of the log text.
        BufferedWriter writer = new BufferedWriter(new FileWriter(inBaseDir("crawl-" + name + ".log")));
        try {
            writer.write(out.toString());
        } finally {
            writer.close();
        }
    }

    /**
     * Loads the sites to crawl from {@code top500.list} in the base directory, one per line.
     * Surrounding whitespace is removed and blank lines are skipped.
     *
     * @throws IOException if the list cannot be read
     */
    private void loadSiteList() throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(inBaseDir("top500.list")));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                String site = line.trim();
                if (site.length() > 0) {
                    urls.add(site);
                }
            }
        } finally {
            in.close();
        }
    }

    /**
     * Locates the temporary Firefox profile directory used by the running driver.
     *
     * <p>Scans {@code java.io.tmpdir} for directories whose name contains
     * {@code webdriver-profile} and returns the most recently modified one.
     * NOTE(review): the location was only checked on Windows by the original author;
     * confirm it is the same on other operating systems.
     *
     * @return path of the most recently modified matching directory
     * @throws IllegalStateException if the temp directory cannot be listed or contains no
     *         matching directory
     */
    private String getDriverProfile() {
        File sysTemp = new File(System.getProperty("java.io.tmpdir"));

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] entries = sysTemp.listFiles();
        if (entries == null) {
            throw new IllegalStateException("Cannot list temp directory: " + sysTemp);
        }

        File newest = null;
        long newestModified = Long.MIN_VALUE;
        for (File candidate : entries) {
            if (!candidate.isDirectory() || !candidate.getName().contains("webdriver-profile")) {
                continue;
            }

            long modified = candidate.lastModified();
            // ">=" keeps the later entry on ties.
            if (newest == null || modified >= newestModified) {
                newest = candidate;
                newestModified = modified;
            }
        }

        if (newest == null) {
            throw new IllegalStateException("No webdriver-profile directory found in " + sysTemp);
        }
        return newest.toString();
    }

    /**
     * Appends a line to the in-memory crawl log and echoes it to standard output.
     *
     * @param s message to log
     */
    private void log(String s) {
        out.append(s).append('\n');
        System.out.println(s);
    }

    /**
     * Logs that a page load timed out and closes any popups it left behind.
     *
     * @param baseWindow handle of the main crawl window
     * @param url site that timed out
     * @param driver driver controlling the browser
     */
    private void handleTimeout(String baseWindow, String url, WebDriver driver) {
        log("\tTimed out loading " + url + ", skipping.");
        killPopups(baseWindow, driver);
    }

    /**
     * Closes every window except the base window and switches back to the base window.
     *
     * @param baseWindow handle of the window to keep
     * @param driver driver controlling the browser
     */
    private void killPopups(String baseWindow, WebDriver driver) {
        for (String handle : driver.getWindowHandles()) {
            if (handle.equals(baseWindow)) {
                continue;
            }

            WebDriver popup = driver.switchTo().window(handle);
            log("\tClosing popup: " + popup.getCurrentUrl());
            popup.close();

            // TODO: need to see if this breaks when there is a modal.
        }

        driver.switchTo().window(baseWindow);
    }

    /**
     * Sleeps for the given time. An interruption just ends the wait early; the crawl continues.
     *
     * @param millis time to wait in milliseconds
     */
    private void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException ignored) {
            // Best effort: a shortened wait is acceptable.
        }
    }

    /**
     * Runs the complete crawl for one Firefox profile.
     *
     * @param namedProfile name of the Firefox profile to crawl with; also used in the names of
     *        the copied database and the log file
     * @throws Exception if the site list cannot be read, the temporary profile directory cannot
     *         be found, or the database or log cannot be written
     */
    public Crawler(String namedProfile) throws Exception {
        loadSiteList();

        // Seconds to stay on each site after loading it.
        int sleepTime = (namedProfile.equals("baseline") ? 10 : 5);

        // NOTE(review): getProfile presumably returns null for an unknown profile name;
        // confirm and consider failing fast instead of crawling with an unintended profile.
        FirefoxProfile profile = new ProfilesIni().getProfile(namedProfile);
        //profile.setPreference("webdriver.load.strategy", "fast");

        WebDriver driver = new FirefoxDriver(profile);
        try {
            driver.manage().timeouts().implicitlyWait(40, TimeUnit.SECONDS);
            driver.manage().timeouts().pageLoadTimeout(40, TimeUnit.SECONDS);
            driver.manage().timeouts().setScriptTimeout(40, TimeUnit.SECONDS);

            // The driver does not expose its temporary profile directory, so look it up on disk.
            String profileDir = getDriverProfile();

            log("Crawling started for " + namedProfile);

            // Original window handle to be used as base, so all other popups can be closed.
            String baseWindow = driver.getWindowHandle();

            int count = 0;
            for (String url : urls) {
                count++;
                log("\t" + count + ". navigating to: " + url);

                // CrawlusInterruptus is defined elsewhere; presumably a watchdog with a
                // 60 second limit around the navigation (confirm against its source).
                CrawlusInterruptus ci = new CrawlusInterruptus(60);
                try {
                    ci.start();

                    try {
                        // Accept a pending "are you sure you want to leave this page?" dialog.
                        driver.switchTo().alert().accept();
                        log("\tAccepted a navigate away modal");
                    } catch (Exception ignored) {
                        // Usually there is no dialog to accept; carry on with the navigation.
                    }

                    driver.get("http://" + url);
                } catch (TimeoutException te) {
                    handleTimeout(baseWindow, url, driver);
                } catch (org.openqa.selenium.UnhandledAlertException me) {
                    log("\tModal exception caused by previous site?");

                    // Retry current site.
                    try {
                        driver.get("http://" + url);
                    } catch (TimeoutException te) {
                        handleTimeout(baseWindow, url, driver);
                    }
                } finally {
                    ci.interrupt();
                }

                pause(sleepTime * 1000L);

                killPopups(baseWindow, driver);
            }

            // 4th party does not know when the crawl is over, so we send a trip signal by
            // navigating to the "last" domain.
            try {
                driver.get("http://www.josesignanini.com");
            } catch (TimeoutException ignored) {
                // The navigation attempt itself is the signal; a timeout is fine.
            }
            pause(60 * 1000L);

            // Copy the fourthparty database out while the browser is still running.
            FileUtils.copyFile(new File(profileDir, "fourthparty.sqlite"),
                    inBaseDir("fourthparty-" + namedProfile + ".sqlite"));
        } finally {
            // Always shut the browser down, even when the crawl fails part-way.
            driver.quit();
        }

        log("Crawling completed for " + namedProfile);

        recordLog(namedProfile);
    }

    /**
     * Runs one crawl per configured Firefox profile, in order. The first failure is printed
     * and stops the remaining crawls.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        /*
        FourthParty implemented:
            - cookies
            - requests
            - redirects
            - local storage
            - amount of data transfer
         TODO:
            - flash cookies
         */
        try {
            String[] profiles = { "baseline", "ghostery", "dntme", "disconnect", "abp-fanboy", "abp-easylist",
                    "trackerblock", /*"requestpolicy", "noscript",*/ "cookies-blocked" };
            for (String profile : profiles) {
                new Crawler(profile);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}