com.reddit.util.ApiUtil.java Source code

Java tutorial

Introduction

Here is the source code for com.reddit.util.ApiUtil.java

Source

/*
Simple Reddit Scraper
Copyright (C) 2012-2013, Gary Paduana, gary.paduana@gmail.com
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package com.reddit.util;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.reddit.RedditScraperBot;
import com.reddit.domain.Comment;
import com.reddit.domain.CommentThread;

public class ApiUtil {

    /** Timer whose periodic task re-opens the request gate (scheduled in the constructor). */
    private Timer timer = new Timer();
    /**
     * Request gate: set to true by the timer thread, polled and reset to false by the
     * thread issuing requests. Declared volatile because it is shared between those two
     * threads; without it the polling loop is not guaranteed to ever observe the write
     * made on the timer thread.
     */
    private volatile boolean delayedSufficiently = false;
    /** Number of requests issued so far; used for the progress line printed per request. */
    private long totalRequests = 0;
    /** Value of System.currentTimeMillis() captured when this instance was constructed. */
    private long start;
    /** Ids of every comment already added to a thread by parseComment. */
    Set<String> processedCommentIds = new HashSet<String>();
    /** Modhash extracted from the last login response; null until one is set. */
    private String modHash = null;
    /** Cookie extracted from the last login response; null until one is set. */
    private String cookie = null;

    /**
     * Creates the helper and starts the periodic timer that paces outgoing requests.
     * The first tick is scheduled for "now", so the first request is allowed almost
     * immediately; every later tick allows one more request.
     *
     * @param delay - time in milliseconds to wait between requests
     */
    public ApiUtil(long delay) {
        // Each tick of this task marks that enough time has passed for another request.
        final TimerTask allowNextRequest = new TimerTask() {
            @Override
            public void run() {
                delayedSufficiently = true;
            }
        };

        final Date firstTick = new Date();
        timer.schedule(allowNextRequest, firstTick, delay);

        // Reference point for the elapsed time shown in the progress output.
        start = System.currentTimeMillis();
    }

    /**
     * Holds the calling thread until the pacing timer has signalled that another request
     * may be made, then prints a progress line and consumes the signal.
     *
     * @param urlString the url about to be fetched; used only in the progress output
     * @throws InterruptedException if the thread is interrupted while waiting
     * @throws MalformedURLException
     * @throws IOException
     * @throws JSONException
     */
    private void possibleDelay(String urlString)
            throws InterruptedException, MalformedURLException, IOException, JSONException {
        // The timer tick and the caller are not synchronised, so simply re-check the
        // flag once a second until the timer has opened the gate.
        while (!delayedSufficiently) {
            Thread.sleep(1000);
        }

        totalRequests += 1;
        final String elapsed = displayTime(System.currentTimeMillis() - start);
        System.out.println("\r\n## Request: " + totalRequests + ", Elapsed time: " + elapsed
                + ", Currently fetching " + urlString);

        // Consume the signal; the next timer tick will set it again.
        delayedSufficiently = false;
    }

    /**
     * Formats a duration given in milliseconds as hours:minutes:seconds.milliseconds.
     * Hours, minutes and seconds are zero-padded to at least two digits and milliseconds
     * to three, e.g. {@code 3723004} becomes {@code "01:02:03.004"}. Hours are not capped,
     * so long durations simply produce more hour digits. A negative duration is rendered
     * as the formatted magnitude with a leading minus sign, e.g. {@code "-00:00:00.001"}.
     *
     * @param ms the duration in milliseconds
     * @return the formatted duration
     */
    public static String displayTime(long ms) {
        // Divide first and take the absolute value of each component afterwards, so that
        // even Long.MIN_VALUE (whose own absolute value overflows) is handled correctly.
        long hr = Math.abs(ms / 3600000L);
        long min = Math.abs((ms % 3600000L) / 60000L);
        long sec = Math.abs((ms % 60000L) / 1000L);
        long millis = Math.abs(ms % 1000L);

        StringBuilder out = new StringBuilder();
        if (ms < 0) {
            out.append('-');
        }
        appendPadded(out, hr, 2).append(':');
        appendPadded(out, min, 2).append(':');
        appendPadded(out, sec, 2).append('.');
        appendPadded(out, millis, 3);
        return out.toString();
    }

    /** Appends a non-negative value to out, left-padded with zeros to at least width digits. */
    private static StringBuilder appendPadded(StringBuilder out, long value, int width) {
        String digits = Long.toString(value);
        for (int i = digits.length(); i < width; i++) {
            out.append('0');
        }
        return out.append(digits);
    }

    /**
     * Retrieves the raw output from a url, waiting first until the request pacing allows it.
     * The response is decoded as UTF-8 (the encoding used for JSON) rather than with the
     * platform default charset, and every line is terminated with "\r\n" in the result.
     *
     * @param urlString the url to fetch
     * @return the response body, with each line followed by "\r\n"
     * @throws JSONException
     * @throws IOException if the url is malformed or the content cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for its turn
     */
    public String getPage(String urlString) throws JSONException, IOException, InterruptedException {
        possibleDelay(urlString);

        URL url = new URL(urlString);
        InputStream is = null;
        BufferedReader br = null;
        StringBuilder sb = new StringBuilder();

        try {
            is = url.openStream();
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        } finally {
            try {
                // Closing the reader also closes the stream it wraps; fall back to the
                // raw stream when the reader was never created.
                if (br != null) {
                    br.close();
                } else if (is != null) {
                    is.close();
                }
            } catch (IOException ignored) {
                // Best-effort close: a failure here must not mask the result or an
                // exception already propagating from the read above.
            }
        }
        return sb.toString();
    }

    /**
     * Builds a CommentThread from the two-element JSONArray of a reddit thread response:
     * element 0 carries the submission itself, element 1 carries its comments.
     *
     * @param threadJSON the parsed thread response
     * @param retrieveMoreComments if false, comments will not be recursively requested,
     *  which is ideal for gathering thread metrics. If true, all comments will be
     *  recursively requested until the thread is completely consumed; this takes a lot
     *  of time and requests for large threads.
     * @return the populated thread, including the comments collected by parseComments
     * @throws JSONException if the response does not have the expected structure
     * @throws IOException
     * @throws InterruptedException
     */
    public CommentThread parseJSON(JSONArray threadJSON, boolean retrieveMoreComments)
            throws JSONException, IOException, InterruptedException {
        CommentThread result = new CommentThread();

        // First listing -> first child -> data: the submission's own attributes.
        JSONObject submissionListing = threadJSON.getJSONObject(0).getJSONObject("data");
        JSONObject submission = submissionListing.getJSONArray("children").getJSONObject(0)
                .getJSONObject("data");

        result.setDownvotes(submission.getLong("downs"));
        result.setUpvotes(submission.getLong("ups"));
        result.setSelftext(submission.getString("selftext"));
        result.setSubreddit(submission.getString("subreddit"));
        result.setTimestamp(submission.getLong("created_utc"));
        result.setAuthor(submission.getString("author"));
        result.setUrl(submission.getString("url"));
        result.setOver18(submission.getBoolean("over_18"));
        result.setTitle(submission.getString("title"));
        result.setNameHash(submission.getString("name"));
        result.setId(submission.getString("id"));

        // Second listing: the top-level comments of the thread.
        JSONObject commentListing = threadJSON.getJSONObject(1).getJSONObject("data");
        JSONArray topLevelComments = commentListing.getJSONArray("children");
        parseComments(topLevelComments, result, retrieveMoreComments);

        return result;
    }

    /**
     * Digs through a JSONArray of comments and extracts relevant information in order to
     * populate a CommentThread object.  There is a lot of assumed information in here based
     * on the reddit api.
     *
     * The comments are flattened and added to a List for this thread.  The tree is disregarded.
     * You may need to provide your own implementation if a tree is desired.
     *
     * Each entry is either a "more" stub, whose data lists ids of comments that were not
     * included in the response, or a comment, whose replies (when present) are walked
     * recursively. An id that has already been processed is skipped individually; the
     * remaining entries of the listing are still handled, and a comment is never added twice.
     *
     * @param jsonComments the "children" array of a comment listing
     * @param commentThread the thread the extracted comments are added to
     * @param retrieveMoreComments - whether or not to request more comments
     * @throws JSONException if an entry does not have the expected structure
     * @throws IOException
     * @throws InterruptedException
     */
    public void parseComments(JSONArray jsonComments, CommentThread commentThread, boolean retrieveMoreComments)
            throws JSONException, IOException, InterruptedException {

        for (int i = 0; i < jsonComments.length(); i++) {
            JSONObject entry = jsonComments.getJSONObject(i);
            JSONObject data = entry.getJSONObject("data");

            if (entry.getString("kind").equals("more")) {
                // A stub listing ids of comments that were left out of this response.
                JSONArray children = data.getJSONArray("children");
                for (int childIndex = 0; childIndex < children.length(); childIndex++) {
                    String childId = children.getString(childIndex);
                    if (processedCommentIds.contains(childId)) {
                        // Already collected; move on to the next id instead of
                        // abandoning the rest of the listing.
                        continue;
                    }
                    if (retrieveMoreComments) {
                        fetchMoreComments(commentThread.getId(), childId, commentThread);
                    }
                }
                continue;
            }

            if (processedCommentIds.contains(data.getString("id"))) {
                // Already collected (e.g. returned again by a later request); skip only
                // this comment so that its unprocessed siblings are not lost.
                continue;
            }

            parseComment(data, commentThread);

            // optJSONObject yields null when "replies" is absent or not a JSON object,
            // in which case there is nothing to descend into.
            JSONObject replies = data.optJSONObject("replies");
            if (replies != null) {
                parseComments(replies.getJSONObject("data").getJSONArray("children"), commentThread,
                        retrieveMoreComments);
            }
        }
    }

    /**
     * Converts the "data" object of a single reddit comment into a Comment, attaches it to
     * the given thread and records its id as processed.
     *
     * @param comment the "data" object of one comment
     * @param commentThread the thread the new Comment belongs to and is appended to
     * @throws JSONException if one of the expected fields is missing
     */
    public void parseComment(JSONObject comment, CommentThread commentThread) throws JSONException {
        final Comment parsed = new Comment();
        parsed.setCommentThread(commentThread);

        // Text and score.
        parsed.setContents(comment.getString("body"));
        parsed.setDownvotes(comment.getLong("downs"));
        parsed.setUpvotes(comment.getLong("ups"));

        // Provenance.
        parsed.setTimestamp(comment.getLong("created_utc"));
        parsed.setAuthor(comment.getString("author"));
        parsed.setId(comment.getString("id"));

        // Remember the id so that later passes can recognise this comment.
        processedCommentIds.add(parsed.getId());
        commentThread.getComments().add(parsed);
    }

    /**
     * The JSON response indicated that more comments exist but they were not returned in this
     * result.  Send another request for additional comments. There is potentially a lot of
     * recursion going on here as we start fresh by requesting a new page, which will have
     * comments, which may have additional comments that need to be fetched as well in a new request.
     *
     * Failed requests are retried indefinitely, pausing between attempts, except for
     * failures that a retry cannot fix (a malformed url, or a resource reported as not
     * found), which are rethrown immediately instead of looping forever.
     *
     * @param parentId id of the thread the comment belongs to
     * @param commentId id of the comment whose page should be requested
     * @param commentThread the thread the fetched comments are added to
     * @throws JSONException if the response cannot be parsed or lacks the expected structure
     * @throws IOException if the request fails in a way that retrying cannot fix
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public void fetchMoreComments(String parentId, String commentId, CommentThread commentThread)
            throws JSONException, IOException, InterruptedException {
        final String pageUrl = "http://www.reddit.com/comments/" + parentId + "/_/" + commentId
                + ".json?limit=500";

        JSONArray commentJSON = null;
        while (commentJSON == null) {
            try {
                commentJSON = new JSONArray(getPage(pageUrl));
            } catch (MalformedURLException e) {
                // The url will be just as malformed on the next attempt.
                throw e;
            } catch (FileNotFoundException e) {
                // Raised by the http url connection for "not found"/"gone" responses;
                // asking again will not make the page appear.
                throw e;
            } catch (IOException e) {
                // I found that this will often fail as reddit can be overloaded and may time out.
                // this will continue attempts until a result is achieved.
                System.out.println("## Request failed (" + e + "), retrying " + pageUrl);
                Thread.sleep(RedditScraperBot.MINIMUM_TIME_BETWEEN_REQUESTS_IN_MS);
            }
        }
        parseComments(commentJSON.getJSONObject(1).getJSONObject("data").getJSONArray("children"), commentThread,
                true);
    }

    /**
     * Experimental right now.  I messed around with this but never really used it for anything.
     *
     * Posts the credentials as a url-encoded form to the given login url, prints the response
     * headers and body to standard output, and stores the "modhash" and "cookie" values found
     * under json.data in the response on this instance.
     *
     * @param url should be new URL("https://ssl.reddit.com/api/login/myusername");
     * @param user the account name
     * @param pw the account password
     * @throws IOException if the request cannot be sent or the response cannot be read
     * @throws JSONException if the response lacks the expected json.data fields
     */
    public void login(URL url, String user, String pw) throws IOException, JSONException {

        // Encode the values so that characters such as '&', '=' or non-ASCII letters in the
        // credentials cannot corrupt the form body.
        String form = "api_type=json&user=" + URLEncoder.encode(user, "UTF-8")
                + "&passwd=" + URLEncoder.encode(pw, "UTF-8");
        // Content-Length must be the number of bytes sent, not the number of characters.
        byte[] payload = form.getBytes("UTF-8");

        HttpURLConnection httpUrlConn = (HttpURLConnection) url.openConnection();
        httpUrlConn.setRequestMethod("POST");
        httpUrlConn.setDoOutput(true);
        httpUrlConn.setUseCaches(false);
        httpUrlConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        httpUrlConn.setRequestProperty("Content-Length", String.valueOf(payload.length));

        DataOutputStream dataOutputStream = new DataOutputStream(httpUrlConn.getOutputStream());
        try {
            dataOutputStream.write(payload);
            dataOutputStream.flush();
        } finally {
            dataOutputStream.close();
        }

        StringBuffer response = new StringBuffer();
        BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(httpUrlConn.getInputStream(), "UTF-8"));
        try {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                response.append(line);
                response.append('\r');
            }
            for (Entry<String, List<String>> r : httpUrlConn.getHeaderFields().entrySet()) {
                System.out.println(r.getKey() + ": " + r.getValue());
            }
        } finally {
            bufferedReader.close();
        }
        System.out.println("Response: " + response.toString());

        // Parse once and read both values from the same object.
        JSONObject loginData = new JSONObject(response.toString()).getJSONObject("json").getJSONObject("data");
        this.setModHash(loginData.getString("modhash"));
        this.setCookie(loginData.getString("cookie"));

    }

    /**
     * @return the timer currently held by this instance
     */
    public Timer getTimer() {
        return timer;
    }

    /**
     * Replaces the timer held by this instance. Only the reference is swapped: this method
     * neither cancels the previously held timer nor schedules anything on the new one.
     *
     * @param timer the timer to hold from now on
     */
    public void setTimer(Timer timer) {
        this.timer = timer;
    }

    /**
     * @return the stored modhash, or null if none has been set yet
     */
    public String getModHash() {
        return modHash;
    }

    /**
     * Stores the modhash; login calls this with the value taken from the login response.
     *
     * @param modHash the modhash to store
     */
    public void setModHash(String modHash) {
        this.modHash = modHash;
    }

    /**
     * @return the stored cookie, or null if none has been set yet
     */
    public String getCookie() {
        return cookie;
    }

    /**
     * Stores the cookie; login calls this with the value taken from the login response.
     *
     * @param cookie the cookie to store
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
}