eu.smartfp7.foursquare.AttendanceCrawler.java Source code

Introduction

Here is the source code for eu.smartfp7.foursquare.AttendanceCrawler.java, a crawler that records the hourly attendance of trending Foursquare venues for one or more cities.

Source

/**  
 * SMART FP7 - Search engine for MultimediA enviRonment generated contenT
 * Webpage: http://smartfp7.eu
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
 * 
 * The Original Code is Copyright (c) 2012-2014 the University of Glasgow
 * All Rights Reserved
 * 
 * Contributor(s):
 *  @author Romain Deveaud <romain.deveaud at glasgow.ac.uk>
 */

package eu.smartfp7.foursquare;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.TimeZone;

import org.apache.commons.lang.time.DateUtils;

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

import eu.smartfp7.foursquare.utils.Settings;
import eu.smartfp7.foursquare.utils.Utils;

/**
 * This class contains methods for crawling the hourly activity of pre-identified
 * trending venues.
 * The crawl can operate on several cities at the same time.
 */

public class AttendanceCrawler {

    /**
     * This function loads the Foursquare IDs of the venues for a given city,
     * reading them from the city's venues.ids file (one ID per line).
     * 
     * @param city The name of the city.
     * @return A collection of Strings representing the IDs of the training venues.
     * @throws IOException
     */
    public static Collection<String> loadVenues(String city) throws IOException {
        Collection<String> venues = new ArrayList<String>();

        String ids_file = Settings.getInstance().getFolder() + city + File.separator + "venues.ids";

        BufferedReader city_file = new BufferedReader(new FileReader(ids_file));
        String line = null;
        while ((line = city_file.readLine()) != null)
            venues.add(line);

        System.out.println(venues.size() + " venues loaded for " + city);
        city_file.close();

        return venues;
    }
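
    /* A venues.ids file simply contains one Foursquare venue ID per line, e.g.
     * (hypothetical IDs):
     *
     *   4ac518cef964a520a6a620e3
     *   4b0588f1f964a520bb8f22e3
     */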

    /**
     * From GitHub issue: https://github.com/SmartSearch/Foursquare-Attendance-Crawler/issues/3
     * 
     * Venues can be deleted by/on Foursquare, resulting in errors when the crawler 
     * attempts to retrieve the hourly attendance. This has bad consequences: the 
     * crawler tries to obtain the attendance over and over, draining the hourly 
     * allowance of API calls, which also impacts the crawling of other venues and 
     * can lead to missing observations.
     * 
     * This function removes a venue from the different files, so that it is no
     * longer considered by the crawler.
     */
    public static void removeVenue(String venue_id, String city) {
        /**
         * First part: we need to remove `venue_id` from the ids file. We use a temporary
         * file to do this.
         */
        String ids_file = Settings.getInstance().getFolder() + city + File.separator + "venues.ids";
        String tmp_ids_file = Settings.getInstance().getFolder() + city + File.separator + "venues.ids.tmp";

        try {
            BufferedReader reader = new BufferedReader(new FileReader(ids_file));
            BufferedWriter writer = new BufferedWriter(new FileWriter(tmp_ids_file));

            String line = null;

            while ((line = reader.readLine()) != null) {
                // Skipping `venue_id` when rewriting the file.
                String trimmedLine = line.trim();
                if (trimmedLine.equals(venue_id))
                    continue;

                writer.write(line + "\n");
            }

            reader.close();
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

        // When we have finished rewriting, we rename the temporary file so that it
        // becomes the real one.
        new File(tmp_ids_file).renameTo(new File(ids_file));

        /** End of first part. */

        /**
         * Second part: we need to delete the files related to the venue that have
         * been created while crawling (i.e. .ts and .info).
         * Instead, we move them into a .deleted folder that can allow us to recover
         * from hypothetical errors.
         */

        new File(Settings.getInstance().getFolder() + city + File.separator + "attendances_crawl" + File.separator
                + venue_id + ".ts")
                        .renameTo(new File(Settings.getInstance().getFolder() + city + File.separator + ".deleted"
                                + File.separator + venue_id + ".ts"));

        new File(Settings.getInstance().getFolder() + city + File.separator + "foursquare_venues" + File.separator
                + venue_id + ".info")
                        .renameTo(new File(Settings.getInstance().getFolder() + city + File.separator + ".deleted"
                                + File.separator + venue_id + ".info"));

        /** End of second part. */
    }

    /**
     * We use the entire hour to do all the calls. This method calculates the
     * amount of time the program has to sleep before each API call in order
     * to finish crawling every venue before the end of the current hour.
     * It does not account for already crawled venues: the per-venue sleep time
     * decreases as the hour progresses. Crawling all venues thus takes
     * approximately 40 minutes.
     */
    public static void intelligentWait(int total_venues, long current_time, long avg_time_spent_crawling) {
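        // Example: with 2,000 venues and 30 minutes left in the current hour,
        // each venue gets a slot of 1,800,000 ms / 2,000 = 900 ms, from which
        // the average time already spent on an API call is subtracted.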
        try {
            // Remaining time until the start of the next hour, divided evenly
            // among all the venues.
            double time = (DateUtils.truncate(new Date(current_time + 3600000), Calendar.HOUR).getTime()
                    - current_time) / (double) total_venues;
            // Never sleep for a negative duration when API calls take longer
            // than the allotted slot.
            if (Math.round(time) < avg_time_spent_crawling)
                avg_time_spent_crawling = 0;
            Thread.sleep(Math.round(time) - avg_time_spent_crawling);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * Gets the JSON String containing all the information about a venue, given its ID.
     */
    public static String getFoursquareVenueById(String venue_id, String id, String secret)
            throws IOException, FoursquareAPIException {
        /** This parameter represents the date of the Foursquare API version that we use. 
         *  If you want to modify the source code, please see https://developer.foursquare.com/overview/versioning */
        String vParam = "20140801";

        String url = "https://api.foursquare.com/v2/venues/" + venue_id + "?client_id=" + id + "&client_secret="
                + secret + "&v=" + vParam;
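
        // The resulting URL looks like the following (placeholder values):
        // https://api.foursquare.com/v2/venues/VENUE_ID?client_id=CLIENT_ID&client_secret=CLIENT_SECRET&v=20140801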

        String json_response = Utils.makeAPICall(url);

        JsonElement parsed_line = new JsonParser().parse(json_response);

        if (parsed_line.getAsJsonObject().get("response").toString().equals("{}")) {
            // An empty "response" object indicates an API error; we propagate
            // the raw JSON so that the caller can inspect the error details.
            throw new FoursquareAPIException(json_response);
        }

        return parsed_line.getAsJsonObject().get("response").getAsJsonObject().get("venue").toString();
    }

    public static String getFoursquareVenueById(String venue_id, String city) throws Exception {
        Map<String, String> credentials = Settings.getInstance().getCityCredentials(city);
        return getFoursquareVenueById(venue_id, credentials.get("client_id"), credentials.get("client_secret"));
    }

    /**
     * The main method takes an arbitrary number of cities as arguments, then
     * initializes the crawling of all the trending venues of these cities.
     * The trending venues must have been previously identified using the
     * `DownloadPages` program.
     * 
     * Current valid cities are: london, amsterdam, goldcoast, sanfrancisco.
     * 
     */
    public static void main(String[] args) throws Exception {
        Settings settings = Settings.getInstance();
        String folder = settings.getFolder();

        // We keep info and error logs, so that we know what happened in case
        // of inconsistencies in the time series.
        Map<String, FileWriter> info_logs = new HashMap<String, FileWriter>();
        Map<String, FileWriter> error_logs = new HashMap<String, FileWriter>();

        // For each city we monitor, we store the venue IDs that we got from
        // a previous crawl.
        Map<String, Collection<String>> city_venues = new HashMap<String, Collection<String>>();

        // Contains the epoch time when the last API call was made for each 
        // venue. Ensures that we get data only once each hour. 
        Map<String, Long> venue_last_call = new HashMap<String, Long>();

        // Contains the epoch time when we last checked if time series were broken
        // for each city.
        // We do these checks once every day before the batch forecasting begins.
        Map<String, Long> sanity_checks = new HashMap<String, Long>();

        // We also keep in memory the number of checkins for the last hour for
        // each venue.
        Map<String, Integer> venue_last_checkin = new HashMap<String, Integer>();

        // Number of API calls made during each hour, keyed by the hour's epoch time.
        Map<Long, Integer> APICallsCount = new HashMap<Long, Integer>();

        DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        int total_venues = 0;
        long total_calls = 0;
        long time_spent_on_API = 0;

        for (String c : args) {
            settings.checkFileHierarchy(c);

            city_venues.put(c, loadVenues(c));
            total_venues += city_venues.get(c).size();

            info_logs.put(c,
                    new FileWriter(folder + c + File.separator + "log" + File.separator + "info.log", true));
            error_logs.put(c,
                    new FileWriter(folder + c + File.separator + "log" + File.separator + "error.log", true));

            Calendar cal = Calendar.getInstance();

            info_logs.get(c).write("[" + df.format(cal.getTime()) + "] Crawler initialization for " + c + ". "
                    + city_venues.get(c).size() + " venues loaded.\n");
            info_logs.get(c).flush();

            // If the program was interrupted for some reason, we can restore the
            // in-memory data from the time series files.
            // Important: the program must not be interrupted for more than one
            // hour, or we will lose time series data.
            for (String venue_id : city_venues.get(c)) {
                String ts_file = folder + c + File.separator + "attendances_crawl" + File.separator + venue_id
                        + ".ts";

                if (new File(ts_file).exists()) {
                    BufferedReader buffer = new BufferedReader(new FileReader(ts_file));
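                    // Skim through the file; `mem` ends up holding the last
                    // line of the time series.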
                    String mem = null, line = null;
                    for (; (line = buffer.readLine()) != null; mem = line)
                        ;
                    buffer.close();

                    if (mem == null)
                        continue;

                    String[] tmp = mem.split(",");
                    venue_last_call.put(venue_id, df.parse(tmp[0]).getTime());
                    venue_last_checkin.put(venue_id, Integer.parseInt(tmp[3]));

                    VenueUtil.fixBrokenTimeSeriesVenue(new File(ts_file));
                } // if
            } // for

            sanity_checks.put(c, cal.getTimeInMillis());
        } // for

        if (total_venues > 5000) {
            System.out.println(
                    "Too many venues for a single API account (max 5000).\nPlease create a new Foursquare API account and use its credentials.\nExiting now.");
            return;
        }

        while (true) {
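            // Main crawl loop: one pass over every city's venue list per
            // iteration, forever.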

            for (String c : args) {
                // We create a FIFO queue and pop venue IDs one at a time.
                LinkedList<String> city_venues_buffer = new LinkedList<String>(city_venues.get(c));
                String venue_id = null;

                // Artificial wait to avoid processors looping at 100% of their capacity
                // when there are no more venues to crawl for the current hour.
                Thread.sleep(3000);

                while ((venue_id = city_venues_buffer.pollFirst()) != null) {
                    // We get the current time according to the city's time zone
                    Calendar cal = Calendar.getInstance();
                    cal.add(Calendar.MILLISECOND,
                            TimeZone.getTimeZone(settings.getCityTimezone(c)).getOffset(cal.getTime().getTime())
                                    - Calendar.getInstance().getTimeZone().getOffset(cal.getTime().getTime()));

                    long current_time = DateUtils.truncate(cal.getTime(), Calendar.HOUR).getTime();

                    // We query Foursquare only once per hour per venue.
                    if (venue_last_call.get(venue_id) != null
                            && current_time < venue_last_call.get(venue_id) + 3600000)
                        continue;

                    intelligentWait(total_venues, cal.getTime().getTime(),
                            (total_calls == 0 ? 0 : Math.round(time_spent_on_API / total_calls)));

                    Venue venue = null;

                    try {
                        long beforeCall = System.currentTimeMillis();
                        venue = new Venue(getFoursquareVenueById(venue_id, c));

                        // If there is no last call, this is the beginning of the time series
                        // for this venue. We get the number of people "here now" to initialize
                        // the series.
                        if (venue_last_call.get(venue_id) == null) {
                            /** TODO: by doing this, we keep a representation of the venue dating
                             *        from the beginning of the specific crawl. We might want to
                             *        change this and update this file once in a while.
                             */
                            FileWriter info = new FileWriter(folder + c + File.separator + "foursquare_venues"
                                    + File.separator + venue_id + ".info");
                            info.write(venue.getFoursquareJson());
                            info.close();

                            FileWriter out = new FileWriter(folder + c + File.separator + "attendances_crawl"
                                    + File.separator + venue_id + ".ts");
                            out.write("Date,here_now,hour_checkins,total_checkins\n");
                            out.write(df.format(current_time) + "," + venue.getHereNow() + "," + venue.getHereNow()
                                    + "," + venue.getCheckincount() + "\n");
                            out.close();
                        } else {
                            FileWriter out = new FileWriter(folder + c + File.separator + "attendances_crawl"
                                    + File.separator + venue_id + ".ts", true);
                            int checks = venue.getCheckincount() - venue_last_checkin.get(venue_id);
                            out.write(df.format(current_time) + "," + venue.getHereNow() + ","
                                    + Integer.toString(checks) + "," + venue.getCheckincount() + "\n");
                            out.close();
                        }

                        if (APICallsCount.get(current_time) == null)
                            APICallsCount.put(current_time, 1);
                        else
                            APICallsCount.put(current_time, APICallsCount.get(current_time) + 1);

                        total_calls++;

                        venue_last_call.put(venue_id, current_time);
                        venue_last_checkin.put(venue_id, venue.getCheckincount());

                        time_spent_on_API += System.currentTimeMillis() - beforeCall;
                    } catch (Exception e) {
                        // If something bad happens (crawler not available, IO error, ...), we put the
                        // venue_id in the FIFO queue so that it gets reevaluated later.
                        error_logs.get(c)
                                .write("[" + df.format(cal.getTime().getTime()) + "] Error with venue " + venue_id
                                        + " (" + e.getMessage() + "). " + APICallsCount.get(current_time)
                                        + " API calls so far this hour, " + city_venues_buffer.size()
                                        + " venues remaining in the buffer.\n");
                        error_logs.get(c).flush();

                        System.out.println("[" + df.format(cal.getTime().getTime()) + "] " + c + " -- "
                                + APICallsCount.get(current_time) + " API calls // " + city_venues_buffer.size()
                                + " venues remaining " + " (" + e.getMessage() + ")");

                        // Distinguish permanently deleted venues from transient errors:
                        // a deleted venue is removed for good, anything else is put back
                        // in the queue and retried.
                        if (e instanceof FoursquareAPIException
                                && ((FoursquareAPIException) e).getHttp_code().equals("400")
                                && ((FoursquareAPIException) e).getError_detail()
                                        .equals("Venue " + venue_id + " has been deleted")) {
                            city_venues.get(c).remove(venue_id);
                            removeVenue(venue_id, c);
                        } else {
                            city_venues_buffer.add(venue_id);
                        }

                        continue;
                    }
                } // while

                // Every day between midnight and 2 am, we repair all the broken time
                // series (if there is something to repair).
                Calendar cal = Calendar.getInstance();
                if (city_venues_buffer.peekFirst() == null
                        && (cal.getTimeInMillis() - sanity_checks.get(c)) >= 86400000
                        && cal.get(Calendar.HOUR_OF_DAY) < 2) {
                    VenueUtil.fixBrokenTimeSeriesCity(c, folder);
                    sanity_checks.put(c, cal.getTimeInMillis());
                    info_logs.get(c).write("[" + df.format(cal.getTime()) + "] Sanity check OK.\n");
                    info_logs.get(c).flush();
                }
            } // for
        } // while
    } // main
} // class
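
Usage

A minimal sketch of how the crawler could be launched, assuming the venues.ids files have already been produced by the `DownloadPages` program and that the crawler classes are on the classpath (the launcher class name is hypothetical):

import eu.smartfp7.foursquare.AttendanceCrawler;

public class CrawlerLauncher {
    public static void main(String[] args) throws Exception {
        // Crawl two of the supported cities; any subset of
        // london, amsterdam, goldcoast and sanfrancisco works.
        AttendanceCrawler.main(new String[] { "london", "amsterdam" });
    }
}

Note that AttendanceCrawler.main() loops forever: it only returns on its own if more than 5000 venues are loaded for a single API account, so the process must normally be stopped externally.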