net.inervo.WMFWiki11.java Source code

Java tutorial

Introduction

Here is the source code for net.inervo.WMFWiki11.java

Source

package net.inervo;

/*
 * Copyright (c) 2011, Ted Timmons, Inervo Networks All rights reserved.
 * 
 * LICENSE EXCEPTION: User:MER-C on Wikipedia may remove this license and use the code however necessary to integrate
 * with the Wiki.java project: http://code.google.com/p/wiki-java/
 * 
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 * 
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following
 * disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 * following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of
 * Inervo Networks nor the names of its contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Map.Entry;
import java.util.logging.Level;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class WMFWiki11 extends org.wikipedia.WMFWiki {
    private static final long serialVersionUID = 1L;

    // only here until this is merged.
    /**
     * Creates an instance for the given domain by delegating directly to the superclass constructor.
     *
     * @param domain
     *            passed unchanged to the {@code org.wikipedia.WMFWiki} constructor (presumably the wiki's host
     *            name; confirm against the superclass)
     */
    public WMFWiki11(String domain) {
        super(domain);
    }

    /**
     * Reports whether a string carries no content at all.
     *
     * @param str
     *            the string to inspect; may be {@code null}
     * @return {@code true} when {@code str} is {@code null} or has length zero, {@code false} otherwise
     */
    protected boolean isEmptyOrNull(String str) {
        return str == null || str.isEmpty();
    }

    /**
     * Parses a decimal {@code long} without propagating the parse failure.
     *
     * @param input
     *            the text to parse; may be {@code null}
     * @return the parsed value, or {@code -1} when {@link Long#parseLong(String)} rejects the input (which
     *         includes {@code null} and the empty string)
     */
    protected long tryLongParse(String input) {
        try {
            return Long.parseLong(input);
        } catch (NumberFormatException notANumber) {
            // Unparseable input is reported through the -1 sentinel rather than an exception.
            return -1L;
        }
    }

    /**
     * Parses a decimal {@code int} without propagating the parse failure.
     *
     * @param input
     *            the text to parse; may be {@code null}
     * @return the parsed value, or {@code -1} when {@link Integer#parseInt(String)} rejects the input (which
     *         includes {@code null}, the empty string and values outside the {@code int} range)
     */
    protected int tryIntParse(String input) {
        int parsed;
        try {
            parsed = Integer.parseInt(input);
        } catch (NumberFormatException notANumber) {
            // Unparseable input is reported through the -1 sentinel rather than an exception.
            parsed = -1;
        }
        return parsed;
    }

    /**
     * Fetches the most recent revision of a page, including its text, through a {@code prop=revisions} query with
     * {@code rvlimit=1} in JSON format.
     * <p>
     * Fields that the response omits (for example a hidden comment, user or text) are passed on as {@code null}
     * instead of failing with a {@link NullPointerException}.
     *
     * @param title
     *            the page title; it is URL-encoded as UTF-8 before being appended to the request
     * @return the top revision, or {@code null} when the response lists no page or the page entry carries no
     *         revisions
     * @throws IOException
     *             if the fetch fails, if the response has no {@code query.pages} object (e.g. an API error
     *             response), or if more than one page entry is returned
     */
    public WMFWiki11RevisionText getTopRevision(String title) throws IOException {
        String url = apiUrl
                + "action=query&prop=revisions&rvprop=timestamp|user|comment|size|content|ids|flags|size&rvlimit=1&format=json&titles="
                + URLEncoder.encode(title, "UTF-8");
        String pageContent = fetch(url, "getTopRevision");

        JsonElement headElement = new JsonParser().parse(pageContent);

        // Walk down to query.pages defensively: an error response has neither key.
        JsonElement queryElement = headElement.isJsonObject() ? headElement.getAsJsonObject().get("query") : null;
        JsonElement pagesElement = (queryElement != null && queryElement.isJsonObject())
                ? queryElement.getAsJsonObject().get("pages")
                : null;
        if (pagesElement == null || !pagesElement.isJsonObject()) {
            throw new IOException("getTopRevision: no query.pages object in the response for \"" + title + "\"");
        }
        JsonObject pages = pagesElement.getAsJsonObject();

        int pageCount = pages.entrySet().size();
        if (pageCount == 0) {
            // no entries.
            return null;
        } else if (pageCount > 1) {
            throw new IOException("we expected one entry. This is awkward, we don't know what to do from here.");
        }

        // Exactly one entry is left; its key (the page id) is not needed.
        Entry<String, JsonElement> onlyEntry = pages.entrySet().iterator().next();
        JsonObject page = onlyEntry.getValue().getAsJsonObject();
        String outTitle = page.get("title").getAsString();

        JsonElement revisionsElement = page.get("revisions");
        if (revisionsElement == null || !revisionsElement.isJsonArray()
                || revisionsElement.getAsJsonArray().size() < 1) {
            return null;
        }

        JsonObject revision = revisionsElement.getAsJsonArray().get(0).getAsJsonObject();

        long revid = revision.get("revid").getAsLong();
        Calendar timestamp = timestampToCalendar(convertTimestamp(revision.get("timestamp").getAsString()));
        String summary = jsonStringOrNull(revision, "comment");
        String user = jsonStringOrNull(revision, "user");

        // A flag is signalled by the key being present (with an empty value in the default JSON format), so the
        // value itself must not be tested for emptiness.
        boolean minor = jsonFlagSet(revision, "minor");
        boolean bot = jsonFlagSet(revision, "bot");
        int size = revision.get("size").getAsInt();

        // rvprop=ids reports the parent revision as "parentid"; the legacy "parent" key is still honoured.
        String parentKey = revision.has("parentid") ? "parentid" : "parent";
        boolean isPageNew = revision.has(parentKey) && revision.get(parentKey).getAsLong() == 0L;

        String content = jsonStringOrNull(revision, "*");

        return new WMFWiki11RevisionText(this, revid, timestamp, outTitle, summary, user, minor, bot, content,
                isPageNew, size);
    }

    /** Returns the member {@code key} of {@code object} as a string, or {@code null} if it is absent or JSON null. */
    private static String jsonStringOrNull(JsonObject object, String key) {
        JsonElement value = object.get(key);
        if (value == null || value.isJsonNull()) {
            return null;
        }
        return value.getAsString();
    }

    /**
     * Returns whether the flag {@code key} is set on {@code object}: a JSON boolean is taken at its value, any
     * other non-null value counts as set, and an absent or JSON-null member counts as not set.
     */
    private static boolean jsonFlagSet(JsonObject object, String key) {
        JsonElement value = object.get(key);
        if (value == null || value.isJsonNull()) {
            return false;
        }
        if (value.isJsonPrimitive() && value.getAsJsonPrimitive().isBoolean()) {
            return value.getAsBoolean();
        }
        return true;
    }

    /**
     * Tests a JSON element for a meaningful value.
     * <p>
     * NOTE: despite the name, the result is the opposite of the {@code String} overload: this method returns
     * {@code true} when the element HAS content.
     *
     * @param element
     *            the element to inspect; may be {@code null}
     * @return {@code false} when {@code element} is {@code null}, JSON null, or its string form is empty or
     *         exactly {@code "0"}; {@code true} otherwise
     */
    protected boolean isEmptyOrNull(JsonElement element) {
        if (element == null || element.isJsonNull()) {
            return false;
        }

        final String text = element.getAsString();
        return !text.isEmpty() && !"0".equals(text);
    }

    /**
     * Placeholder that is not implemented yet: the argument is ignored and {@code null} is always returned.
     *
     * @param page
     *            currently unused
     * @return always {@code null}
     */
    protected Revision populateRevision(JsonObject page) {

        return null;
    }

    /**
     * Downloads the resource at a URL and returns its text with all line terminators removed (the lines are
     * concatenated without a separator).
     * <p>
     * The bytes are decoded as UTF-8 rather than with the platform default charset, and the stream is closed even
     * when reading fails.
     *
     * @param urlString
     *            the URL to read
     * @return the concatenated lines of the resource
     * @throws Exception
     *             if the URL is malformed or the resource cannot be opened or read
     */
    public static String fetchPage(String urlString) throws Exception {
        URL url = new URL(urlString);
        StringBuilder content = new StringBuilder();
        BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));

        // try/finally (rather than try-with-resources) keeps the source level the rest of this file uses.
        try {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                content.append(inputLine);
            }
        } finally {
            in.close();
        }

        return content.toString();
    }

    /******************************/

    /**
     * Builds a {@code Revision} from the attributes of a DOM element.
     * <p>
     * {@link Element#getAttribute(String)} returns an empty string (never {@code null}) for a missing attribute,
     * so presence-only markers ({@code commenthidden}, {@code minor}, {@code bot}, {@code new}) are tested with
     * {@link Element#hasAttribute(String)}.
     *
     * @param ele
     *            the element carrying the revision attributes ({@code revid}, {@code timestamp}, {@code title},
     *            {@code comment}, {@code user}, {@code size}, {@code rcid} and the flags)
     * @return the populated revision; numeric attributes that are missing or unparseable become {@code -1}, and
     *         the summary is {@code null} when the comment is marked hidden
     */
    protected Revision parseRevision(Element ele) {
        long oldid = tryLongParse(ele.getAttribute("revid"));
        log(Level.INFO, ele.getAttribute("timestamp"), "timestamp");
        Calendar timestamp = timestampToCalendar(convertTimestamp(ele.getAttribute("timestamp")));
        String title = ele.getAttribute("title");
        String summary = ele.hasAttribute("commenthidden") ? null : ele.getAttribute("comment");
        String user2 = ele.getAttribute("user");
        boolean minor = ele.hasAttribute("minor");
        boolean bot = ele.hasAttribute("bot");
        int size = tryIntParse(ele.getAttribute("size"));

        // A page creation is marked by the "new" flag or by a parent revision id of 0. The legacy "parent"
        // attribute is still consulted; a missing attribute parses to -1 and therefore never matches.
        boolean isPageNew = ele.hasAttribute("new") || tryIntParse(ele.getAttribute("parent")) == 0
                || tryLongParse(ele.getAttribute("parentid")) == 0L;

        Revision revision = new Revision(oldid, timestamp, title, summary, user2, minor, bot, isPageNew, size);
        revision.setRcid(tryLongParse(ele.getAttribute("rcid")));
        return revision;
    }

    /**
     * Lists new pages from {@code start} up to the moment of the call. Delegates to the five-argument overload
     * with an end time of {@code new GregorianCalendar()}.
     *
     * @param amount
     *            number of results requested
     * @param namespace
     *            the namespace to search
     * @param rcoptions
     *            a bitmask of HIDE_ANON etc that dictate which pages we return (e.g. exclude patrolled pages =>
     *            rcoptions = HIDE_PATROLLED).
     * @param start
     *            the time at which enumeration begins
     * @return Revisions object, which contains the list of Revision objects and the rcstart continuation
     * @throws IOException
     *             on network error
     * @throws ParserConfigurationException
     *             on parsing error
     * @throws SAXException
     *             on parsing error
     */
    public Revisions newPages(int amount, int namespace, int rcoptions, Calendar start)
            throws IOException, ParserConfigurationException, SAXException {
        Calendar now = new GregorianCalendar();
        return newPages(amount, namespace, rcoptions, start, now);
    }

    /**
     * Lists new pages from a start time given as a timestamp string up to the moment of the call. The string is
     * converted with {@code timestampToCalendar} and the call is delegated to the five-argument overload with an
     * end time of {@code new GregorianCalendar()}.
     *
     * @param amount
     *            number of results requested
     * @param namespace
     *            the namespace to search
     * @param rcoptions
     *            a bitmask of HIDE_ANON etc that dictate which pages we return (e.g. exclude patrolled pages =>
     *            rcoptions = HIDE_PATROLLED).
     * @param start
     *            the time at which enumeration begins, as a timestamp string accepted by
     *            {@code timestampToCalendar}
     * @return Revisions object, which contains the list of Revision objects and the rcstart continuation
     * @throws IOException
     *             on network error
     * @throws ParserConfigurationException
     *             on parsing error
     * @throws SAXException
     *             on parsing error
     */
    public Revisions newPages(int amount, int namespace, int rcoptions, String start)
            throws IOException, ParserConfigurationException, SAXException {
        Calendar startTime = timestampToCalendar(start);
        Calendar now = new GregorianCalendar();
        return newPages(amount, namespace, rcoptions, startTime, now);
    }

    /**
     * Fetches new pages ({@code list=recentchanges} with {@code rctype=new}, oldest first) starting at
     * {@code start}, and keeps requesting further batches until at least {@code amount} revisions have been
     * collected or no continuation point is left. {@code end} is sent as {@code rcend}.
     * <p>
     * Each batch is fetched with one automatic retry on {@link IOException}; a second failure propagates.
     *
     * @param amount
     *            number of results requested
     * @param namespace
     *            the namespace to search; {@code ALL_NAMESPACES} omits the namespace filter
     * @param rcoptions
     *            a bitmask of HIDE_ANON etc that dictate which pages we return (e.g. exclude patrolled pages =>
     *            rcoptions = HIDE_PATROLLED). Values that are not positive, or that match no known flag, add no
     *            filter.
     * @param start
     *            start time
     * @param end
     *            end time
     * @return Revisions object, which contains the list of Revision objects and the rcstart continuation
     *         ({@code null} or empty when there is nothing further to fetch)
     * @throws IOException
     *             on network error
     * @throws ParserConfigurationException
     *             on parsing error
     * @throws SAXException
     *             on parsing error
     */
    public Revisions newPages(int amount, int namespace, int rcoptions, Calendar start, Calendar end)
            throws IOException, ParserConfigurationException, SAXException {
        StringBuilder url = new StringBuilder(query);
        url.append(
                "action=query&list=recentchanges&rcprop=title%7Cids%7Cuser%7Ctimestamp%7Cflags%7Ccomment&rclimit=max&rcdir=newer&rctype=new&rcend=");
        url.append(calendarToTimestamp(end));

        if (namespace != ALL_NAMESPACES) {
            url.append("&rcnamespace=");
            url.append(namespace);
        }
        appendRcShowFilter(url, rcoptions);

        // Everything up to "&rcstart=" is fixed; only the start timestamp changes between batches.
        url.append("&rcstart=");
        String baseUrl = url.toString();
        String rcstart = calendarToTimestamp(start);

        // The parser does not depend on the batch, so it is created once and reused.
        // NOTE(review): the parser runs with default settings; consider disabling DTDs/external entities.
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();

        ArrayList<Revision> revisions = new ArrayList<Revision>(amount);
        do {
            String line = null;

            try {
                line = fetch(baseUrl + rcstart, "newPages");
            } catch (IOException ex) {
                log(Level.WARNING, "fetching newPages returned an IOException. Retrying.", "recentChangesFFF");

                // retry once.
                line = fetch(baseUrl + rcstart, "newPages");
            }

            Document dom = db.parse(new ByteArrayInputStream(line.getBytes("UTF-8")));

            Element docEle = dom.getDocumentElement();
            rcstart = parseRecentChangesDocument(docEle, revisions, amount);

            log(Level.INFO, "revsize: " + revisions.size() + ", amount: " + amount + ", rcstart: " + rcstart,
                    "recentChangesFFF");

        } while (amount > revisions.size() && rcstart != null && rcstart.length() > 0);

        return new Revisions(revisions, rcstart);
    }

    /**
     * Appends the {@code &rcshow=} parameter for the HIDE_* bits set in {@code rcoptions}, joining the individual
     * filters with an encoded pipe. Nothing is appended when no known bit is set.
     */
    private void appendRcShowFilter(StringBuilder url, int rcoptions) {
        if (rcoptions <= 0) {
            return;
        }

        final String separator = "%7C";
        StringBuilder filters = new StringBuilder();
        if ((rcoptions & HIDE_ANON) == HIDE_ANON) {
            filters.append("!anon").append(separator);
        }
        if ((rcoptions & HIDE_SELF) == HIDE_SELF) {
            filters.append("!self").append(separator);
        }
        if ((rcoptions & HIDE_MINOR) == HIDE_MINOR) {
            filters.append("!minor").append(separator);
        }
        if ((rcoptions & HIDE_PATROLLED) == HIDE_PATROLLED) {
            filters.append("!patrolled").append(separator);
        }
        if ((rcoptions & HIDE_BOT) == HIDE_BOT) {
            filters.append("!bot").append(separator);
        }

        if (filters.length() > 0) {
            // Every entry ends with the separator, so exactly one trailing separator has to go.
            filters.setLength(filters.length() - separator.length());
            url.append("&rcshow=").append(filters);
        }
    }

    /**
     * Parse the DOM tree associated with a Recent Changes listing.
     * <p>
     * Revisions are read from the children of every {@code recentchanges} element whose parent is {@code query}
     * and appended to {@code revisions}. Once {@code amount} is reached, one further entry is read only to take
     * its timestamp as the next start value; that entry is not added. If no such entry exists, the next start
     * value is taken from the {@code query-continue} section instead.
     * <p>
     * Child nodes that are not elements (such as whitespace text or comments) are skipped.
     *
     * @param docEle
     *            the document element of the response
     * @param revisions
     *            the list that receives the parsed revisions
     * @param amount
     *            the total number of revisions wanted in {@code revisions}
     * @return rcstart string for our next set of results, or {@code null} when none could be determined
     */
    protected String parseRecentChangesDocument(Element docEle, ArrayList<Revision> revisions, int amount) {
        NodeList rcitems = docEle.getElementsByTagName("recentchanges");

        String nextStart = null;
        boolean oneMore = false;
        OUTER: for (int j = 0; j < rcitems.getLength(); ++j) {
            Node rcitem = rcitems.item(j);

            // getElementsByTagName also finds the "recentchanges" element nested under query-continue.
            String parent = rcitem.getParentNode().getNodeName();
            if (!parent.equalsIgnoreCase("query")) {
                continue;
            }

            NodeList items = rcitem.getChildNodes();
            for (int i = 0; i < items.getLength(); ++i) {
                Node nodeItem = items.item(i);
                if (nodeItem.getNodeType() != Node.ELEMENT_NODE) {
                    // Text or comment nodes cannot be cast to Element and carry no revision data.
                    continue;
                }

                Revision rev = parseRevision((Element) nodeItem);
                log(Level.INFO, "have rev: " + rev.getPage(), "parseRecentChangesDocument");

                if (oneMore) {
                    nextStart = calendarToTimestamp(rev.getTimestamp());
                    break OUTER;
                }

                revisions.add(rev);
                if (revisions.size() >= amount) {
                    // we have enough, but we need to go a step further to get the next start time.
                    log(Level.INFO, "have enough revs", "parseRecentChangesDocument");
                    oneMore = true;
                }

            }
        }

        // if we didn't get a start from having enough revisions, we can get it from query-continue.
        if (nextStart == null) {
            nextStart = parseQueryContinue(docEle.getElementsByTagName("query-continue"));
        }
        return nextStart;
    }

    /**
     * parse the query-continue section of DOM, which is where the next rcstart value is.
     * <p>
     * The child elements of each given node are scanned in order. The first non-empty {@code rcstart} attribute
     * wins; otherwise the part of a non-empty {@code rccontinue} attribute before the first {@code |} is used.
     * Child nodes that are not elements are skipped.
     *
     * @param qcitems
     *            a NodeList starting with the query-continue element(s).
     * @return the rcstart String, passed through {@code convertTimestamp}, or {@code null} when no continuation
     *         attribute is found
     */
    protected String parseQueryContinue(NodeList qcitems) {
        for (int j = 0; j < qcitems.getLength(); ++j) {
            NodeList items = qcitems.item(j).getChildNodes();

            for (int i = 0; i < items.getLength(); ++i) {
                Node nodeItem = items.item(i);
                if (nodeItem.getNodeType() != Node.ELEMENT_NODE) {
                    // Text or comment nodes cannot be cast to Element and carry no attributes.
                    continue;
                }
                Element continuation = (Element) nodeItem;

                // getAttribute yields "" (never null) for a missing attribute.
                String attribute = continuation.getAttribute("rcstart");
                if (attribute.length() > 0) {
                    String nextStart = convertTimestamp(attribute);
                    log(Level.INFO, "query-continue rcstart: " + nextStart, "parseQueryContinue");
                    return nextStart;
                }

                attribute = continuation.getAttribute("rccontinue");
                if (attribute.length() > 0) {
                    String splitAttr = attribute.split("\\|")[0];
                    String nextStart = convertTimestamp(splitAttr);
                    log(Level.INFO, "query-continue rccontinue: " + nextStart, "parseQueryContinue");
                    return nextStart;
                }
            }
        }
        return null;
    }

    /**
     * Fetches the {@code amount} most recent changes in the main namespace. WARNING: The recent changes table only
     * stores new pages for about a month. It is not possible to retrieve changes before then. Equivalent to
     * [[Special:Recentchanges]].
     * <p>
     * Note: Log entries in recent changes have a revid of 0!
     * <p>
     * This is a convenience wrapper that delegates to the four-argument {@code recentChanges} overload with
     * {@code 0}, {@code false} and {@code MAIN_NAMESPACE}. NOTE(review): the overload is not defined in this
     * file; the {@code 0} and {@code false} arguments presumably mean "no filter options" and "not only new
     * pages" - confirm against the superclass.
     *
     * @param amount
     *            the number of entries to return
     * @return the recent changes that satisfy these criteria
     * @throws IOException
     *             if a network error occurs
     * @since 0.23
     */
    public Revision[] recentChanges(int amount) throws IOException {
        return recentChanges(amount, 0, false, MAIN_NAMESPACE);
    }

    /**
     * Turns a calendar into a timestamp of the format yyyymmddhhmmss. Might be useful for subclasses.
     * <p>
     * The fields are read in the calendar's own time zone; no conversion is applied. The digits are always ASCII,
     * independent of the JVM's default locale.
     *
     * @param c
     *            the calendar to convert
     * @return the converted calendar
     * @see #timestampToCalendar
     * @since 0.08
     */
    protected String calendarToTimestamp(Calendar c) {
        // Locale.ROOT keeps %d from emitting locale-specific digits under default locales that use them.
        return String.format(java.util.Locale.ROOT, "%04d%02d%02d%02d%02d%02d", c.get(Calendar.YEAR),
                c.get(Calendar.MONTH) + 1, // Calendar months are zero-based
                c.get(Calendar.DAY_OF_MONTH), c.get(Calendar.HOUR_OF_DAY), c.get(Calendar.MINUTE),
                c.get(Calendar.SECOND));
    }

    /**
     * Result holder pairing a batch of revisions with the {@code rcstart} value from which a follow-up request
     * can continue. Both values are stored exactly as passed in; the list is not copied.
     */
    public class Revisions {
        /** The revisions of this batch, as handed to the constructor. */
        List<Revision> revs;
        /** The continuation start value, as handed to the constructor. */
        String rcstart;

        /**
         * @param revs
         *            the revisions of this batch
         * @param rcstart
         *            the continuation start value
         */
        public Revisions(List<Revision> revs, String rcstart) {
            this.rcstart = rcstart;
            this.revs = revs;
        }

        /** @return the revision list given at construction */
        public List<Revision> getRevisionList() {
            return this.revs;
        }

        /** @return the continuation start value given at construction */
        public String getRcStart() {
            return this.rcstart;
        }
    }
}