org.karndo.piracy.Scraper.java Source code

Java tutorial

Introduction

Here is the source code for org.karndo.piracy.Scraper.java

Source

/*
   Copyright 2012, Shane Boulden
    
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    
   http://www.apache.org/licenses/LICENSE-2.0
    
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 */
package org.karndo.piracy;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.text.ParseException;
import java.util.LinkedList;
import java.util.Date;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.*;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.commons.io.IOUtils.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.openide.windows.InputOutput;

/**
 * A class for scraping data from the IMB live piracy map. Data from the site is
 * obtained from an HTML page; the class parses this data into PiracyEvent
 * objects for display/manipulation on the map.
 * @author Shane Boulden
 * @version June 2012
 */
public class Scraper {

    /** Substring of the script-include line that precedes the map data line. */
    private static final String MAP_SCRIPT_MARKER =
            "http://www.icc-ccs.org/plugins/fabrik_visualization/googlemap/googlemap-min.js";

    /** Text that immediately precedes the first event object in the data line. */
    private static final String ICONS_MARKER = "\"icons\":[";

    /** Label that precedes the attack identifier inside an event chunk. */
    private static final String ATTACK_ID_LABEL = "Attack ID:";

    /** Pattern of the date embedded in each event description. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Downloads the page at the given address and returns the line holding the
     * map data, i.e. the first non-blank line following the line that contains
     * the googlemap script include.
     *
     * @param url the address of the page to download
     * @return the raw data line, or an empty string if the script include (or
     *         any non-blank line after it) is not found
     * @throws IOException if the request fails, the response carries no body,
     *         or the body cannot be read
     */
    private String get_piracy_data(String url) throws IOException {
        //Some Apache HTTP objects for accessing the website
        HttpClient httpclient = new DefaultHttpClient();
        InputStream instream = null;
        try {
            HttpResponse response = httpclient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body received from " + url);
            }
            instream = entity.getContent();

            //NOTE(review): the platform default charset is used for decoding,
            //as before; confirm whether the page declares a specific encoding
            BufferedReader reader = new BufferedReader(new InputStreamReader(instream));
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.contains(MAP_SCRIPT_MARKER)) {
                    continue;
                }
                //The data sits on the next line that has content. Skipping
                //blank lines explicitly (rather than skipping one character)
                //works for any line terminator and never drops data characters.
                String candidate;
                while ((candidate = reader.readLine()) != null) {
                    if (!StringUtils.isBlank(candidate)) {
                        return candidate;
                    }
                }
            }
            return "";
        } finally {
            //release the stream and the connection on every exit path
            try {
                if (instream != null) {
                    instream.close();
                }
            } finally {
                httpclient.getConnectionManager().shutdown();
            }
        }
    }

    /**
     * Downloads the piracy map page and converts the events embedded in it
     * into PiracyEvent objects. Every event that is parsed successfully is also
     * printed to the supplied InputOutput. Chunks of the data line that do not
     * describe a complete event are skipped.
     *
     * @param url the address of the piracy map page
     * @param io the InputOutput whose output writer receives each parsed event
     * @return the list of parsed events, in the order they appear in the page;
     *         empty if no event could be parsed
     * @throws IOException if the page cannot be downloaded
     */
    public LinkedList<PiracyEvent> parse_piracy_data(String url, InputOutput io) throws IOException {

        //use the private method in this class to get the data
        String data = get_piracy_data(url);
        System.out.println(data);

        LinkedList<PiracyEvent> events_list = new LinkedList<PiracyEvent>();

        //drop everything up to and including the opening of the "icons" array;
        //if the marker is absent the whole line is scanned instead
        String payload = StringUtils.contains(data, ICONS_MARKER)
                ? StringUtils.substringAfter(data, ICONS_MARKER)
                : data;

        //each event starts with a 'curly bracket', so split on those
        String[] chunks = StringUtils.split(payload, "{");
        if (chunks == null) {
            return events_list;
        }

        for (String chunk : chunks) {
            PiracyEvent event = parse_event(chunk);
            if (event == null) {
                continue;
            }
            events_list.add(event);

            //print to the supplied InputOutput from the main window
            io.getOut().println(event);
        }

        return events_list;
    }

    /**
     * Builds a PiracyEvent from one chunk of the data line.
     *
     * @param str the text between two opening curly brackets
     * @return the parsed event, or null if the chunk lacks a latitude,
     *         longitude or date, or if one of those values is malformed
     */
    private PiracyEvent parse_event(String str) {
        //Strip out the latitude and longitude (quoted numbers)
        String lat1 = StringUtils.strip(StringUtils.substringBetween(str, "\"0\":", ","), "\"");
        String long1 = StringUtils.strip(StringUtils.substringBetween(str, "\"1\":", ","), "\"");
        //strip out the date
        String date_str = StringUtils.trim(StringUtils.substringBetween(str, "Date:", "<br"));

        //chunks without these fields are not events (e.g. trailing map options)
        if (lat1 == null || long1 == null || date_str == null) {
            return null;
        }

        //substringBetween already excludes the labels, so trimming is all that
        //is needed; stripping a character set here would eat letters of the value
        String vessel_type = StringUtils.trim(StringUtils.substringBetween(str, "Vessel:", "<br \\/>"));
        String status = StringUtils.trim(StringUtils.substringBetween(str, "Status:", "<br \\/>"));
        String attack_id = extract_attack_id(StringUtils.substringBetween(str, "\"2\":", "<br \\/>"));

        try {
            //parse the values into doubles
            double latitude = Double.parseDouble(lat1);
            double longitude = Double.parseDouble(long1);
            // TODO change this to a GMT time-format
            Date date = DateUtils.parseDate(date_str, DATE_PATTERN);

            return new PiracyEvent(latitude, longitude, attack_id, date, status, vessel_type);
        } catch (NumberFormatException ex) {
            System.err.println("A number format exception occurred parsing the coordinates");
            System.err.println(ex.getMessage());
        } catch (ParseException ex) {
            System.err.println("A parse exception occurred parsing the date");
            System.err.println(ex.getMessage());
        }
        return null;
    }

    /**
     * Removes the opening quote and the "Attack ID:" label from the raw
     * identifier field.
     *
     * @param raw the text between the "2" key and the first line break; may be
     *        null
     * @return the bare identifier, or null if raw is null
     */
    private String extract_attack_id(String raw) {
        String id = StringUtils.contains(raw, ATTACK_ID_LABEL)
                ? StringUtils.substringAfter(raw, ATTACK_ID_LABEL)
                : raw;
        //only quotes and spaces can legitimately surround the identifier
        return StringUtils.strip(id, "\" ");
    }
}