com.marklogic.mapreduce.examples.WikiLoader.java Source code

Introduction

Here is the source code for com.marklogic.mapreduce.examples.WikiLoader.java
Source

/*
 * Copyright 2003-2016 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce.examples;

import info.bliki.wiki.model.WikiModel;

import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.GenericOptionsParser;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;

import com.marklogic.cpox.SimpleLogger;
import com.marklogic.cpox.Utilities;
import com.marklogic.mapreduce.ContentOutputFormat;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.xcc.Session;

/**
 * Load wiki documents from HDFS into MarkLogic Server.
 * Used with the configuration file conf/marklogic-wiki.xml.
 */

public class WikiLoader {
    public static class ArticleMapper extends Mapper<Text, Text, DocumentURI, Text> {

        private DocumentURI uri = new DocumentURI();

        public void map(Text path, Text page, Context context) throws IOException, InterruptedException {
            uri.setUri(path.toString());
            context.write(uri, page);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length < 2) {
            System.err.println("Usage: WikiLoader configFile inputDir");
            System.exit(2);
        }
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = Job.getInstance(conf, "wiki loader");
        job.setJarByClass(WikiLoader.class);
        job.setInputFormatClass(WikiInputFormat.class);
        job.setMapperClass(ArticleMapper.class);
        job.setMapOutputKeyClass(DocumentURI.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(ContentOutputFormat.class);

        ContentInputFormat.setInputPaths(job, new Path(otherArgs[1]));

        conf = job.getConfiguration();
        conf.addResource(otherArgs[0]);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Input format for wiki dump files: every file may be split, and each split
 * is handed to a {@link WikiReader}.
 */
class WikiInputFormat extends FileInputFormat<Text, Text> {

    /**
     * Reports every input file as splittable.
     *
     * @param jobContext job the file belongs to (not consulted)
     * @param file       file in question (not consulted)
     * @return always {@code true}
     */
    @Override
    protected boolean isSplitable(JobContext jobContext, Path file) {
        return true;
    }

    /**
     * Supplies the reader for one split. The reader is returned
     * uninitialised; neither argument is inspected here.
     *
     * @return a new {@link WikiReader}
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit inputSplit,
            TaskAttemptContext taskContext) throws IOException, InterruptedException {
        RecordReader<Text, Text> reader = new WikiReader();
        return reader;
    }

}

class Article {
    String title;
    StringBuilder pageContent;

    public Article(String title, StringBuilder pageContent) {
        this.title = title;
        this.pageContent = pageContent;
    }
}

class WikiReader extends RecordReader<Text, Text> {

    /** Size in bytes of the buffer used when reading the split. */
    static final int BUFFER_SIZE = 65536;
    /** Number of bytes requested per read once reading continues past the split. */
    static final int READ_AHEAD_SIZE = 2048;
    /** Opening tag of a page element. */
    static final String BEGIN_PAGE_TAG = "<page>";
    /** Closing tag of a page element. */
    static final String END_PAGE_TAG = "</page>";
    /** Closing tag of the mediawiki root element. */
    static final String END_DOC_TAG = "</mediawiki>";
    /** Key object handed out by getCurrentKey; refilled by nextKeyValue. */
    private Text key = new Text();
    /** Value object handed out by getCurrentValue; refilled by nextKeyValue. */
    private Text value = new Text();
    /** Articles produced for this split by initialize; null until then. */
    private List<Article> articles;
    /** Number of articles already returned through nextKeyValue. */
    private int recordCount = 0;

    /** Creates an empty reader; the split is loaded later by initialize. */
    public WikiReader() {
    }

    /**
     * Does nothing: the input stream opened by initialize is already closed
     * before initialize returns, so the reader holds no open resource.
     */
    @Override
    public void close() throws IOException {
    }

    /**
     * @return the key object of this reader; the same instance is returned
     *         on every call and is overwritten by nextKeyValue
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * @return the value object of this reader; the same instance is returned
     *         on every call and is overwritten by nextKeyValue
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports the fraction of loaded articles that have been handed out.
     *
     * @return 0 while there are no articles, otherwise
     *         {@code recordCount / articles.size()}
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        boolean nothingLoaded = articles == null || articles.isEmpty();
        return nothingLoaded ? 0 : recordCount / (float) articles.size();
    }

    /**
     * Loads every page that belongs to this split and converts the pages to
     * articles.
     *
     * <p>A page belongs to the split when its {@code <page>} tag begins at a
     * byte offset inside the split. The split is read completely, then
     * reading continues past its end, {@link #READ_AHEAD_SIZE} bytes at a
     * time, until the last page that begins inside the split is closed. With
     * this rule each page is picked up by exactly one split.
     *
     * <p>If the file ends before that last page is closed, the incomplete
     * page is dropped and the complete pages before it are kept. If no page
     * begins inside the split, the reader yields no records.
     *
     * <p>The selected bytes are decoded as UTF-8 in one step.
     * NOTE(review): UTF-8 is assumed to be the encoding of the dump - confirm.
     *
     * @param inSplit the split to read; must be a {@link FileSplit}
     * @param context supplies the configuration used to open the file
     * @throws IOException if the file cannot be opened or read
     */
    @Override
    public void initialize(InputSplit inSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) inSplit;
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        byte[] buf = new byte[BUFFER_SIZE];
        // Holds the raw bytes one per char (ISO-8859-1 maps bytes to chars
        // one-to-one), so indexes are byte offsets relative to the split
        // start and a multi-byte character cut by a read boundary is only
        // decoded after both halves are present.
        StringBuilder raw = new StringBuilder();
        String owned = null;

        FSDataInputStream fileIn = fs.open(file);
        try {
            fileIn.seek(split.getStart());

            // 1. Read the split itself.
            long remaining = split.getLength();
            boolean more = true;
            while (more && remaining > 0) {
                int got = readChunk(fileIn, buf, (int) Math.min(remaining, buf.length), raw);
                if (got == -1) {
                    System.out.println("Unexpected EOF: bytesTotal=" + split.getLength()
                            + " bytesRead=" + raw.length());
                    more = false;
                } else {
                    remaining -= got;
                }
            }
            int splitEnd = raw.length();

            // 2. Read just far enough past the split that a <page> tag which
            // begins on its last bytes can be recognised.
            int visible = splitEnd + BEGIN_PAGE_TAG.length() - 1;
            while (more && raw.length() < visible) {
                more = readChunk(fileIn, buf, READ_AHEAD_SIZE, raw) != -1;
            }

            int firstStart = raw.indexOf(BEGIN_PAGE_TAG);
            if (firstStart != -1 && firstStart < splitEnd) {
                // Last page that begins inside the split; it must be read to
                // its end even if that end lies beyond the split.
                int lastStart = raw.lastIndexOf(BEGIN_PAGE_TAG, splitEnd - 1);
                int searchFrom = lastStart;
                int end = raw.indexOf(END_PAGE_TAG, searchFrom);
                while (end == -1 && more) {
                    // Resume slightly before the old tail so that an end tag
                    // cut in two by the read boundary is still found.
                    searchFrom = Math.max(lastStart, raw.length() - END_PAGE_TAG.length() + 1);
                    more = readChunk(fileIn, buf, READ_AHEAD_SIZE, raw) != -1;
                    end = raw.indexOf(END_PAGE_TAG, searchFrom);
                }

                int ownedEnd;
                if (end != -1) {
                    ownedEnd = end + END_PAGE_TAG.length();
                } else {
                    System.out.println("Unexpected EOF inside a page: bytesTotal=" + split.getLength()
                            + " bytesRead=" + raw.length());
                    // keep only the pages that are complete
                    ownedEnd = lastStart;
                }
                if (ownedEnd > firstStart) {
                    byte[] bytes = raw.substring(firstStart, ownedEnd).getBytes("ISO-8859-1");
                    owned = new String(bytes, "UTF-8");
                }
            }
        } finally {
            fileIn.close();
        }

        if (owned == null) {
            articles = new ArrayList<Article>();
        } else {
            articles = WikiModelProcessor.process(new StringBuilder(owned));
        }
    }

    /**
     * Reads at most {@code max} bytes and appends them to {@code raw}, one
     * char per byte.
     *
     * @return the number of bytes read, or -1 at end of file
     */
    private static int readChunk(FSDataInputStream in, byte[] buf, int max, StringBuilder raw)
            throws IOException {
        int got = in.read(buf, 0, Math.min(max, buf.length));
        if (got > 0) {
            raw.append(new String(buf, 0, got, "ISO-8859-1"));
        }
        return got;
    }

    /**
     * Advances to the next loaded article, if any, copying its title into
     * the key and its page content into the value.
     *
     * @return {@code true} if an article was made current, {@code false}
     *         once all articles have been returned or none were loaded
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean exhausted = articles == null || recordCount >= articles.size();
        if (exhausted) {
            return false;
        }
        Article next = articles.get(recordCount);
        key.set(next.title);
        value.set(next.pageContent.toString());
        recordCount += 1;
        return true;
    }

    static class WikiModelProcessor {
        /**
         * 
         */
        /** Local name of the element whose text becomes the page title. */
        private static final String TITLE = "title";

        /** Local name of the element that delimits one article. */
        private static final String PAGE = "page";

        /** Local name of the document's root element. */
        private static final String ROOT = "mediawiki";

        /** Namespace URI of the xml: prefix, used to look up xml:lang. */
        private static final String NS_XML = "http://www.w3.org/XML/1998/namespace";

        /**
         * Start of a synthetic document that is placed in front of the
         * extracted pages: the opening mediawiki root element (export-0.4
         * default namespace, xml:lang "en") followed by a siteinfo element.
         */
        private static final String HEADER = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.4/\" "
                + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
                + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.4/"
                + "http://www.mediawiki.org/xml/export-0.4.xsd\" version=\"0.4\" " + "xml:lang=\"en\"> \n"
                + "  <siteinfo> \n" + "    <sitename>Wikipedia</sitename> \n"
                + "    <base>http://en.wikipedia.org/wiki/Main_Page</base> \n"
                + "    <generator>MediaWiki 1.16alpha-wmf</generator> \n" + "    <case>first-letter</case> \n"
                + "    <namespaces> \n" + "      <namespace key=\"-2\">Media</namespace> \n"
                + "      <namespace key=\"-1\">Special</namespace> \n" + "      <namespace key=\"0\" /> \n"
                + "      <namespace key=\"1\">Talk</namespace> \n"
                + "      <namespace key=\"2\">User</namespace> \n"
                + "      <namespace key=\"3\">User talk</namespace> \n"
                + "      <namespace key=\"4\">Wikipedia</namespace> \n"
                + "      <namespace key=\"5\">Wikipedia talk</namespace> \n"
                + "      <namespace key=\"6\">File</namespace> \n"
                + "      <namespace key=\"7\">File talk</namespace> \n"
                + "      <namespace key=\"8\">MediaWiki</namespace> \n"
                + "      <namespace key=\"9\">MediaWiki talk</namespace> \n"
                + "      <namespace key=\"10\">Template</namespace> \n"
                + "      <namespace key=\"11\">Template talk</namespace> \n"
                + "      <namespace key=\"12\">Help</namespace> \n"
                + "      <namespace key=\"13\">Help talk</namespace> \n"
                + "      <namespace key=\"14\">Category</namespace> \n"
                + "      <namespace key=\"15\">Category talk</namespace> \n"
                + "      <namespace key=\"100\">Portal</namespace> \n"
                + "      <namespace key=\"101\">Portal talk</namespace> \n" + "    </namespaces> \n"
                + "  </siteinfo> \n";

        /** Closes the root element opened by HEADER. */
        private static final String FOOTER = "\n</mediawiki>";

        // All state below is static, i.e. shared by every use of this class
        // within one JVM.

        /** Names of the elements currently open, innermost last. */
        private static LinkedList<String> path;

        /** Output buffer of the page being processed; null outside a page. */
        private static StringBuilder article;

        /** Title most recently read from a title element. */
        private static String title;

        /** Parser that walks the input document. */
        private static XmlPullParser xpp;

        static SimpleLogger logger = SimpleLogger.getSimpleLogger();

        /** Count of text fragments whose rendered markup failed the check in parse. */
        private static int errors = 0;

        /** Count of page start tags seen. */
        private static int pages = 0;

        /** Namespace of the root element, copied onto each emitted page. */
        private static String namespace;

        /** xml:lang value of the root element, copied onto each emitted page. */
        private static String language;

        /** Factory that creates both the main parser and the checking parser. */
        private static XmlPullParserFactory factory;

        /** Parser used by parse to check the markup produced by the wiki renderer. */
        private static XmlPullParser parser;

        // NOTE(review): nothing in this class assigns session; it is only
        // closed, when non-null, at the end of the document.
        private static Session session;

        /** Articles collected as pages are completed. */
        private static List<Article> articles;

        /**
         * Converts a sequence of page elements into articles.
         *
         * <p>The pages are wrapped between {@code HEADER} and {@code FOOTER}
         * to form one document, which is then walked with a pull parser.
         * Any exception raised while parsing is logged, not rethrown; the
         * articles completed up to that point are still returned.
         *
         * <p>Each call starts with an empty result list and zeroed counters,
         * so the result never contains articles from an earlier call.
         * {@code input} itself is not modified.
         *
         * @param input the page elements to process; {@code null} is treated
         *              as no pages
         * @return the articles found, in document order; never {@code null}
         */
        public static List<Article> process(StringBuilder input) {
            // The collected state is static: start every call from scratch,
            // otherwise a second call in the same JVM would hand back the
            // articles of the first call again.
            articles = new ArrayList<Article>();
            pages = 0;
            errors = 0;

            // Build the document in a separate buffer so the caller's
            // builder is left untouched.
            StringBuilder document = new StringBuilder(HEADER);
            if (null != input) {
                document.append(input);
            }
            document.append(FOOTER);

            Properties properties = new Properties();
            try {
                // the property is not set, so the factory falls back to its
                // default lookup
                factory = XmlPullParserFactory
                        .newInstance(properties.getProperty(XmlPullParserFactory.PROPERTY_NAME), null);
                factory.setNamespaceAware(true);
                xpp = factory.newPullParser();
                xpp.setInput(new StringReader(document.toString()));

                // TODO feature isn't supported by xpp3 - look at xpp5?
                // xpp.setFeature(XmlPullParser.FEATURE_DETECT_ENCODING, true);
                // TODO feature isn't supported by xpp3 - look at xpp5?
                // xpp.setFeature(XmlPullParser.FEATURE_PROCESS_DOCDECL, true);
                xpp.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);

                logger.configureLogger(new Properties());

                process();
            } catch (Exception ex) {
                // best effort: keep whatever was completed before the failure
                logger.logException(ex);
            }
            logger.info("finished " + pages + " pages with " + errors + " errors");
            return articles;
        }

        /**
         * Walks the document with the main parser, dispatching each event
         * until the end of the document is reached.
         *
         * <p>Text is only collected while a page is open. Text directly
         * inside a {@code comment} or {@code text} element is passed through
         * {@code parse}; all other text is XML-escaped.
         *
         * @throws IOException            on an unexpected parser event or a
         *                                structural error in the document
         * @throws XmlPullParserException if the document is not well-formed
         */
        private static void process() throws XmlPullParserException, IOException {
            // transform to final output
            int event;
            path = new LinkedList<String>();
            article = null;
            title = null;

            logger.info("starting loop");

            while (true) {
                event = xpp.next();
                switch (event) {
                case XmlPullParser.END_DOCUMENT:
                    processEndDocument();
                    // exit the loop
                    return;
                case XmlPullParser.END_TAG:
                    processEndElement(xpp.getName());
                    break;
                case XmlPullParser.START_TAG:
                    processStartElement(xpp.getName());
                    break;
                case XmlPullParser.TEXT:
                    if (null != article) {
                        String name = path.getLast();
                        if ("comment".equals(name) || "text".equals(name)) {
                            // parse comment elements
                            // parse text elements
                            String rendered = parse(xpp.getText());
                            // StringBuilder.append(null) would write the four
                            // characters "null" into the article; skip instead
                            if (null != rendered) {
                                article.append(rendered);
                            }
                        } else {
                            article.append(Utilities.escapeXml(xpp.getText()));
                        }
                    }
                    break;
                default:
                    throw new IOException("unexpected event: " + event + " at " + xpp.getPositionDescription());
                }
            }
        }

        /**
         * Renders wiki markup to XML and checks that the result is usable.
         *
         * <p>The rendered markup is wrapped in a dummy element and run
         * through a second parser. If it is not well-formed, or contains a
         * control character other than tab, line feed or carriage return,
         * a warning is logged, the error counter is incremented and the
         * XML-escaped original text is returned instead.
         *
         * @param text wiki markup; may be {@code null}
         * @return the rendered XML, the escaped original text if rendering
         *         produced unusable markup, or the empty string if there was
         *         nothing to render; never {@code null}
         * @throws IOException if the checking parser reports an unexpected
         *                     event or fails to read
         */
        private static String parse(String text) throws IOException {
            // Return "" rather than null for "nothing": the caller appends
            // the result to a StringBuilder, where null becomes "null".
            if (null == text || "".equals(text.trim())) {
                return "";
            }
            // parse wiki markup to xml
            // TODO: this is slow with bliki - might need concurrency
            // use a new object every time, to prevent leaks
            // no doubt this makes it slower...
            String xml = new WikiModel("${image}", "${title}").render(text);

            if (null == xml) {
                return "";
            }
            if ("".equals(xml.trim())) {
                return xml;
            }

            // verify xml is well-formed
            try {
                // use this xpp object to check output from the wikimedia parser
                parser = factory.newPullParser();
                parser.setInput(new StringReader("<dummy>" + xml + "</dummy>"));
                parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
                int event;
                String temp;
                char[] chars;
                int c;
                while (true) {
                    // with some Japanese text, next() throws
                    // ArrayIndexOutOfBoundsException
                    try {
                        event = parser.next();
                    } catch (ArrayIndexOutOfBoundsException e) {
                        // keep the original exception as the cause
                        throw new XmlPullParserException(e.getMessage(), parser, e);
                    }
                    switch (event) {
                    case XmlPullParser.END_DOCUMENT:
                        // exit the loop
                        return xml;
                    case XmlPullParser.END_TAG:
                        parser.getName();
                        parser.getNamespace();
                        parser.getText();
                        break;
                    case XmlPullParser.START_TAG:
                        parser.getName();
                        parser.getNamespace();
                        parser.getText();
                        break;
                    case XmlPullParser.TEXT:
                        temp = parser.getText();
                        if (null != temp) {
                            chars = temp.toCharArray();
                            // xpp3 doesn't check codepoint values
                            // check them to avoid XDMP errors
                            for (int i = 0; i < chars.length; i++) {
                                c = chars[i];
                                // #x9 | #xA | #xD
                                // | [#x20-#xD7FF]
                                // | [#xE000-#xFFFD]
                                // | [#x10000-#x10FFFF]
                                // this implementation is abbreviated
                                if (9 == c || 10 == c || 13 == c || c > 31) {
                                    continue;
                                }
                                throw new XmlPullParserException("bad codepoint value: " + c, parser, null);
                            }
                        }
                        break;
                    default:
                        throw new IOException(
                                "unexpected event: " + event + " at " + parser.getPositionDescription());
                    }
                }
            } catch (XmlPullParserException e) {
                logger.warning(title + ": " + e.getMessage());
                errors++;
                return Utilities.escapeXml(text);
            }
        }

        /**
         * Handles an end tag: pops the element from the open-element stack,
         * copies the parser's current text into the open article, and, when
         * the closed element is a page, stores the finished article.
         *
         * <p>The stored article is named {@code language + "wiki/" + title}.
         *
         * @param name local name of the element being closed
         * @throws IOException if {@code name} is not the innermost open
         *                     element, or the title cannot be encoded
         */
        private static void processEndElement(String name) throws IOException {
            // logger.info(name);
            boolean matchesOpenElement = path.getLast().equals(name);
            if (!matchesOpenElement) {
                throw new IOException(
                        "found " + name + " expected " + path.getLast() + "; " + title + "; " + article);
            }
            path.removeLast();

            boolean insidePage = null != article;
            if (insidePage) {
                // NOTE(review): relies on the parser returning the tag's own
                // text for an end-tag event - confirm for the parser in use
                article.append(xpp.getText());
            }

            // only the end of a page completes an article
            if (!insidePage || !PAGE.equals(name)) {
                return;
            }

            // switch for URI-escaping the title; currently off
            boolean encodeTitle = false;
            String titlePart;
            if (encodeTitle) {
                URI encoded;
                try {
                    // this form of URI() does escaping nicely
                    encoded = new URI(null, title, null);
                } catch (URISyntaxException e) {
                    try {
                        // URI(schema, ssp, fragment) constructor cannot handle
                        // ssp = 2008-11-07T12:23:47.617766-08:00/1
                        // (despite what the javadoc says)...
                        // in this situation, treat the path as the fragment.
                        encoded = new URI(null, null, title);
                    } catch (URISyntaxException e1) {
                        throw new IOException(e);
                    }
                }
                titlePart = encoded.toString();
            } else {
                titlePart = title;
            }

            // the article name includes the language
            String articleName = language + "wiki/" + titlePart;
            if (null == articles) {
                articles = new ArrayList<Article>();
            }
            articles.add(new Article(articleName, article));

            // ready for the next page
            article = null;
        }

        /**
         * Handles a start tag.
         *
         * <p>The element name is pushed onto the open-element stack. Then:
         * the root element only has its namespace and xml:lang recorded;
         * a page element opens a new article buffer; any other element has
         * the parser's current text copied into the open article (unless it
         * is an empty-element tag). A title element is additionally consumed
         * in full here, including its end tag.
         *
         * @param name local name of the element being opened
         * @throws IOException            if a page starts while another page
         *                                is still open
         * @throws XmlPullParserException if reading the title text fails
         */
        private static void processStartElement(String name) throws IOException, XmlPullParserException {
            // logger.info(name);
            path.add(name);
            // the root element is not part of any article: just remember
            // what has to be copied onto each page
            if (ROOT.equals(name)) {
                namespace = xpp.getNamespace();
                language = xpp.getAttributeValue(NS_XML, "lang");
                return;
            }

            // look for start of article
            if (PAGE.equals(name)) {
                if (null != article) {
                    throw new IOException("article not null at start of page");
                }
                // this is clumsy, but should work ok
                article = new StringBuilder("<" + PAGE
                // propagate the XML namespace
                        + (null == namespace ? "" : (" xmlns=\"" + namespace + "\""))
                        // propagate the xml:lang attribute
                        + (null == language ? "" : (" xml:lang=\"" + language + "\""))
                        // end of the start tag
                        + ">");
                pages++;
                return;
            }

            if (null != article && !xpp.isEmptyElementTag()) {
                // write empty elements via end-element, only.
                // note that attributes are still ok in this case
                // NOTE(review): relies on the parser returning the tag's own
                // text for a start-tag event - confirm for the parser in use
                article.append(xpp.getText());
            }

            if (!TITLE.equals(name)) {
                return;
            }

            // remember the title; it is used to name the article when the
            // page ends
            title = xpp.nextText().trim();
            // NOTE(review): article is dereferenced without a null check, so
            // a title element outside a page would fail here
            article.append(Utilities.escapeXml(title));
            // this puts us at the end element for title
            processEndElement(name);
        }

        /**
         * Verifies that the document ended cleanly - no element and no
         * article still open - and closes the session if there is one.
         *
         * @throws IOException if an element or an article is still open
         */
        private static void processEndDocument() throws IOException {
            boolean elementStillOpen = !path.isEmpty();
            if (elementStillOpen) {
                throw new IOException("document end before end tag (" + path.size() + ") " + path.getLast() + " "
                        + xpp.getPositionDescription());
            }

            boolean articleStillOpen = article != null;
            if (articleStillOpen) {
                throw new IOException("article not null at end of document: " + title + "; " + article.toString()
                        + "; " + xpp.getPositionDescription());
            }

            if (null == session) {
                return;
            }
            session.close();
        }
    }
}