org.jamwiki.migrate.MediaWikiXmlImporter.java Source code

Java tutorial

Introduction

Here is the source code for org.jamwiki.migrate.MediaWikiXmlImporter.java

Source

/**
 * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the latest version of the GNU Lesser General
 * Public License as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program (LICENSE.txt); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jamwiki.migrate;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.jamwiki.DataAccessException;
import org.jamwiki.Environment;
import org.jamwiki.WikiBase;
import org.jamwiki.WikiException;
import org.jamwiki.WikiMessage;
import org.jamwiki.model.Namespace;
import org.jamwiki.model.Topic;
import org.jamwiki.model.TopicVersion;
import org.jamwiki.parser.WikiLink;
import org.jamwiki.utils.WikiLogger;
import org.jamwiki.utils.WikiUtil;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Provide functionality for importing a Mediawiki XML file into JAMWiki.
 */
public class MediaWikiXmlImporter extends DefaultHandler implements TopicImporter {

    private static final WikiLogger logger = WikiLogger.getLogger(MediaWikiXmlImporter.class.getName());
    /** Maximum number of topic versions that can be stored before being flushed to the database. */
    private static final int MAX_TOPIC_VERSION_BUFFER = 50;
    private static final SAXParserFactory SAX_PARSER_FACTORY;

    /** This map holds the current tag's attribute names and values.  It is cleared after an end-element is called and thus fails for nested elements. */
    private Map<String, String> currentAttributeMap = new HashMap<String, String>();
    /** Map used when converting namespaces.  Created for performance reasons to avoid recompiling patterns. */
    private Map<String, Pattern> convertNamespaceMap = new HashMap<String, Pattern>();
    /** This buffer holds the content of the current element during parsing.  It will be flushed after an end-element tag is reached. */
    private StringBuilder currentElementBuffer = new StringBuilder();
    private Topic currentTopic = null;
    private TopicVersion currentTopicVersion = new TopicVersion();
    private Map<Date, Integer> currentTopicVersions = new TreeMap<Date, Integer>();
    private final Map<String, String> mediawikiNamespaceMap = new HashMap<String, String>();
    private Map<Topic, List<Integer>> parsedTopics = new HashMap<Topic, List<Integer>>();
    private int previousTopicContentLength = 0;
    /** For performance reasons add topic versions to the dabase in batches. */
    private List<TopicVersion> topicVersionBuffer = new ArrayList<TopicVersion>();
    private String virtualWiki;

    static {
        // For big file parsing
        System.setProperty("entityExpansionLimit", "1000000");
        SAX_PARSER_FACTORY = SAXParserFactory.newInstance();
    }

    /**
     *
     */
    public Map<Topic, List<Integer>> importFromFile(File file, String virtualWiki) throws MigrationException {
        this.virtualWiki = virtualWiki;
        this.importWikiXml(file);
        return this.parsedTopics;
    }

    /**
     *
     */
    private void importWikiXml(File file) throws MigrationException {
        FileInputStream fis = null;
        try {
            // at least in 1.5, the SaxParser has a bug where files with names like "%25s"
            // will be read as "%s", generating FileNotFound exceptions.  To work around this
            // issue use a FileInputStream rather than just SAXParser.parse(file, handler)
            fis = new FileInputStream(file);
            SAXParser saxParser = SAX_PARSER_FACTORY.newSAXParser();
            saxParser.parse(fis, this);
        } catch (ParserConfigurationException e) {
            throw new MigrationException(e);
        } catch (IOException e) {
            throw new MigrationException(e);
        } catch (SAXException e) {
            if (e.getCause() instanceof DataAccessException || e.getCause() instanceof WikiException) {
                throw new MigrationException(e.getCause());
            } else {
                throw new MigrationException(e);
            }
        } finally {
            IOUtils.closeQuietly(fis);
        }
    }

    /**
     * Convert the Wikipedia article namespace (if any) to a JAMWiki article namespace.
     */
    private String convertArticleNameFromWikipediaToJAMWiki(String fullName) {
        String ret = fullName;
        int pos = fullName.indexOf(Namespace.SEPARATOR);
        if (pos > 0) {
            String namespace = fullName.substring(0, pos);
            String title = fullName.substring(pos + 1);
            String jamwikiNamespace = mediawikiNamespaceMap.get(namespace);
            if (!StringUtils.isBlank(jamwikiNamespace)) {
                // matching JAMWiki namespace found
                ret = jamwikiNamespace + Namespace.SEPARATOR + title;
            }
        }
        // remove any characters that are valid for Mediawiki but not JAMWiki
        ret = StringUtils.remove(ret, '?');
        return ret;
    }

    /**
     * Convert all namespaces names from MediaWiki to JAMWiki local representation.
     */
    private String convertToJAMWikiNamespaces(String topicContent) {
        // convert all namespaces names from MediaWiki to JAMWiki local representation
        Pattern pattern;
        for (Map.Entry<String, String> entry : mediawikiNamespaceMap.entrySet()) {
            if (entry.getValue() == null || StringUtils.equalsIgnoreCase(entry.getValue(), entry.getKey())) {
                continue;
            }
            pattern = this.convertNamespaceMap.get(entry.getKey());
            if (pattern == null) {
                // convert from Mediawiki to JAMWiki namespaces.  handle "[[", "[[:", "{{", "{{:".
                // note that "?:" is a regex non-capturing group.
                String patternString = "((?:(?:\\[\\[)|(?:\\{\\{))[ ]*(?::)?)" + entry.getKey()
                        + Namespace.SEPARATOR;
                Pattern mediawikiPattern = Pattern.compile(patternString,
                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
                pattern = mediawikiPattern;
                this.convertNamespaceMap.put(entry.getKey(), mediawikiPattern);
            }
            topicContent = pattern.matcher(topicContent).replaceAll("$1" + entry.getValue() + Namespace.SEPARATOR);
        }
        return topicContent;
    }

    /**
     *
     */
    private Timestamp parseMediaWikiTimestamp(String timestamp) {
        try {
            Date date = DateUtils.parseDate(timestamp, new String[] { MediaWikiConstants.ISO_8601_DATE_FORMAT });
            return new Timestamp(date.getTime());
        } catch (ParseException e) {
            // FIXME - this should be handled somehow
            return new Timestamp(System.currentTimeMillis());
        }
    }

    /**
     * Initialize the current topic, validating that it does not yet exist.
     */
    private void initCurrentTopic(String topicName) throws SAXException {
        topicName = convertArticleNameFromWikipediaToJAMWiki(topicName);
        WikiLink wikiLink = new WikiLink(null, this.virtualWiki, topicName);
        Topic existingTopic = null;
        try {
            existingTopic = WikiBase.getDataHandler().lookupTopic(this.virtualWiki, topicName, false);
        } catch (DataAccessException e) {
            throw new SAXException("Failure while validating topic name: " + this.virtualWiki + ':' + topicName, e);
        }
        if (existingTopic != null && existingTopic.getVirtualWiki().equals(this.virtualWiki)) {
            // do a second comparison of capitalized topic names in a case-sensitive way
            // since the initial topic lookup will return a case-insensitive match for some
            // namespaces.
            String existingTopicName = (Environment.getBooleanValue(Environment.PROP_PARSER_ALLOW_CAPITALIZATION))
                    ? StringUtils.capitalize(existingTopic.getPageName())
                    : existingTopic.getPageName();
            String importTopicName = (Environment.getBooleanValue(Environment.PROP_PARSER_ALLOW_CAPITALIZATION))
                    ? StringUtils.capitalize(wikiLink.getArticle())
                    : wikiLink.getArticle();
            if (StringUtils.equals(existingTopicName, importTopicName)) {
                // FIXME - update so that this merges any new versions instead of throwing an error
                WikiException e = new WikiException(new WikiMessage("import.error.topicexists", topicName));
                throw new SAXException(
                        "Topic " + this.virtualWiki + ':' + topicName + " already exists and cannot be imported",
                        e);
            }
        }
        this.currentTopic = new Topic(this.virtualWiki, wikiLink.getNamespace(), wikiLink.getArticle());
        this.currentTopic.setTopicType(WikiUtil.findTopicTypeForNamespace(wikiLink.getNamespace()));
    }

    /**
     * Write a topic version record to the database.
     */
    private void commitTopicVersion() throws SAXException {
        // FIXME - support rollback
        this.currentTopic.setTopicContent(currentTopicVersion.getVersionContent());
        // only the final import version is logged
        this.currentTopicVersion.setLoggable(false);
        // no recent change record needed - can be added by reloading all recent changes if desired
        this.currentTopicVersion.setRecentChangeAllowed(false);
        this.topicVersionBuffer.add(this.currentTopicVersion);
        this.writeTopicVersion(false);
    }

    /**
     * After all topic versions have been created for a topic, go back and set the previous topic version
     * ID values for each version.  This must be done after parsing since the XML file may not contain
     * version records sorted chronologically from oldest to newest.
     */
    private void orderTopicVersions() throws SAXException {
        if (this.currentTopicVersions.isEmpty()) {
            throw new SAXException("No topic versions found for " + this.currentTopic.getName());
        }
        // topic versions are stored in a tree map to allow sorting... convert to a list
        List<Integer> currentTopicVersionIdList = new ArrayList<Integer>(this.currentTopicVersions.values());
        try {
            WikiBase.getDataHandler().orderTopicVersions(this.currentTopic, currentTopicVersionIdList);
        } catch (DataAccessException e) {
            throw new SAXException(
                    "Failure while ordering topic versions for topic: " + this.currentTopic.getName(), e);
        }
        this.parsedTopics.put(this.currentTopic, currentTopicVersionIdList);
    }

    //===========================================================
    // SAX DocumentHandler methods
    //===========================================================

    /**
     * start of xml-tag
     *
     * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or
     *  if Namespace processing is not being performed.
     * @param localName The local name (without prefix), or the empty string if Namespace processing
     *  is not being performed.
     * @param qName The qualified name (with prefix), or the empty string if qualified names are not available.
     * @param attrs The attributes attached to the element. If there are no attributes, it shall be an
     *  empty Attributes object.
     */
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        this.currentElementBuffer = new StringBuilder();
        this.currentAttributeMap = new HashMap<String, String>();
        String key;
        if (attrs != null) {
            // populate the attribute map
            for (int i = 0; i < attrs.getLength(); i++) {
                key = attrs.getQName(i);
                if (!StringUtils.isBlank(key)) {
                    this.currentAttributeMap.put(key, attrs.getValue(i));
                }
            }
        }
        if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION.equals(qName)) {
            this.currentTopicVersion = new TopicVersion();
            this.currentTopicVersion.setEditType(TopicVersion.EDIT_IMPORT);
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC.equals(qName)) {
            this.currentTopicVersions = new TreeMap<Date, Integer>();
        }
    }

    /**
     * end of xml-tag
     *
     * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or
     *  if Namespace processing is not being performed.
     * @param localName The local name (without prefix), or the empty string if Namespace processing
     *  is not being performed.
     * @param qName The qualified name (with prefix), or the empty string if qualified names are not available.
     */
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (StringUtils.equals(MediaWikiConstants.MEDIAWIKI_ELEMENT_NAMESPACE, qName)) {
            int key = NumberUtils.toInt(this.currentAttributeMap.get("key"));
            try {
                Namespace jamwikiNamespace = WikiBase.getDataHandler().lookupNamespaceById(key);
                if (jamwikiNamespace != null) {
                    String mediawikiNamespace = currentElementBuffer.toString().trim();
                    mediawikiNamespaceMap.put(mediawikiNamespace, jamwikiNamespace.getLabel(this.virtualWiki));
                }
            } catch (DataAccessException e) {
                throw new SAXException("Failure while processing namespace with ID: " + key, e);
            }
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_NAME.equals(qName)) {
            String topicName = currentElementBuffer.toString().trim();
            this.initCurrentTopic(topicName);
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_CONTENT.equals(qName)) {
            String topicContent = this.convertToJAMWikiNamespaces(currentElementBuffer.toString().trim());
            currentTopicVersion.setVersionContent(topicContent);
            currentTopicVersion.setCharactersChanged(StringUtils.length(topicContent) - previousTopicContentLength);
            previousTopicContentLength = StringUtils.length(topicContent);
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION_COMMENT.equals(qName)) {
            this.currentTopicVersion.setEditComment(currentElementBuffer.toString().trim());
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION_MINOR_EDIT.equals(qName)) {
            this.currentTopicVersion.setEditType(TopicVersion.EDIT_MINOR);
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION_EDIT_DATE.equals(qName)) {
            this.currentTopicVersion
                    .setEditDate(this.parseMediaWikiTimestamp(currentElementBuffer.toString().trim()));
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION_IP.equals(qName)
                || MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION_USERNAME.equals(qName)) {
            // Login name in Mediawiki can be longer than 100 characters, so trim to conform to
            // JAMWiki limits.  In general very long login names seem to be used only by vandals,
            // so this should be an acceptable workaround.
            String authorDisplay = currentElementBuffer.toString().trim();
            if (authorDisplay.length() > 100) {
                authorDisplay = authorDisplay.substring(0, 100);
            }
            this.currentTopicVersion.setAuthorDisplay(authorDisplay);
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC_VERSION.equals(qName)) {
            this.commitTopicVersion();
        } else if (MediaWikiConstants.MEDIAWIKI_ELEMENT_TOPIC.equals(qName)) {
            // flush any pending topic version data
            this.writeTopicVersion(true);
            this.orderTopicVersions();
        }
    }

    /**
     * When the parser encounters plain text (not XML elements), it calls this method
     * which accumulates them in a string buffer
     */
    public void characters(char buf[], int offset, int len) throws SAXException {
        currentElementBuffer.append(buf, offset, len);
    }

    /**
     *
     */
    private void writeTopicVersion(boolean forceWrite) throws SAXException {
        try {
            // for performance reasons write the topic once to create an initial record, then write
            // only the version record.
            if (this.currentTopic.getTopicId() <= 0) {
                // metadata is needed only for the final import version, so for performance reasons
                // do not include category or link data for older versions
                WikiBase.getDataHandler().writeTopic(this.currentTopic, null, null, null);
            } else if (forceWrite || this.topicVersionBuffer.size() >= MAX_TOPIC_VERSION_BUFFER) {
                WikiBase.getDataHandler().writeTopicVersions(this.currentTopic, this.topicVersionBuffer);
                for (TopicVersion topicVersion : this.topicVersionBuffer) {
                    this.currentTopicVersions.put(topicVersion.getEditDate(), topicVersion.getTopicVersionId());
                }
                this.topicVersionBuffer = new ArrayList<TopicVersion>();
            }
        } catch (DataAccessException e) {
            throw new SAXException("Failure while writing topic: " + this.currentTopic.getName(), e);
        } catch (WikiException e) {
            throw new SAXException("Failure while writing topic: " + this.currentTopic.getName(), e);
        }
    }
}