org.jamwiki.utils.XMLTopicFactory.java Source code

Java tutorial

Introduction

Here is the source code for org.jamwiki.utils.XMLTopicFactory.java

Source

/**
 * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the latest version of the GNU Lesser General
 * Public License as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program (LICENSE.txt); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jamwiki.utils;

import java.io.File;
import java.util.Hashtable;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import org.apache.commons.lang.StringUtils;
import org.jamwiki.WikiBase;
import org.jamwiki.model.Topic;
import org.jamwiki.model.TopicVersion;
import org.jamwiki.model.WikiUser;
import org.jamwiki.parser.ParserUtil;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX event handler that imports the pages of a MediaWiki XML export file
 * into JAMWiki.
 *
 * <p>While the file is parsed the handler records the namespace table declared
 * by the export (<code>&lt;namespace key="..."&gt;</code> elements), and for every
 * <code>&lt;page&gt;</code> element it builds a topic from the page's
 * <code>&lt;title&gt;</code> and <code>&lt;text&gt;</code> and writes it through the
 * JAMWiki data handler.</p>
 *
 * <p>All parsing state is kept in instance fields, so one instance must not be
 * used for two imports at the same time.</p>
 */
public class XMLTopicFactory extends DefaultHandler {

    /** Amount to indent */
    private static final String XML_INDENT = "    ";
    private static final String LINE_END = System.getProperty("line.separator");
    private static final WikiLogger logger = WikiLogger.getLogger(XMLTopicFactory.class.getName());

    /** SAX features that are switched off so that imported files cannot pull in external entities. */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities"
    };

    /** MediaWiki namespace ids that this importer knows how to map. */
    private static final int MEDIAWIKI_NS_MAIN = 0;
    private static final int MEDIAWIKI_NS_IMAGE = 6;
    private static final int MEDIAWIKI_NS_TEMPLATE = 10;
    private static final int MEDIAWIKI_NS_CATEGORY = 14;
    /** Value used when a title prefix is not a namespace declared by the imported file. */
    private static final int UNKNOWN_NAMESPACE = -1;

    private final WikiUser user;
    private final String authorIpAddress;
    private int indentLevel = 0;
    String virtualWiki = "en";
    /** Namespace name (String) to MediaWiki namespace id (Integer), as declared by the imported file. */
    Hashtable namespaces = new Hashtable();
    /** Name of the category namespace (id 14) in the imported file. */
    String ns14 = "Category";
    /** Name of the image namespace (id 6) in the imported file. */
    String ns6 = "Image";
    /** Key attribute of the namespace element currently being read. */
    Integer nsKey = null;
    String nsVal = null;
    /** Character data collected since the most recent start tag. */
    StringBuffer lastStr = null;
    String pageName = null;
    String pageText = null;
    private String processedTopicName = null;

    /**
     * Create an importer.
     *
     * @param virtualWiki The virtual wiki that imported topics are assigned to.
     * @param user The user recorded as author of the imported topic versions.
     * @param authorIpAddress The IP address recorded for the imported topic versions.
     */
    public XMLTopicFactory(String virtualWiki, WikiUser user, String authorIpAddress) {
        this.virtualWiki = virtualWiki;
        this.authorIpAddress = authorIpAddress;
        this.user = user;
    }

    /**
     * Parse a MediaWiki XML export file and store every page it contains.
     *
     * @param file The XML file to import.
     * @return The name of the last topic that was written, or <code>null</code>
     *  if no topic was written.
     * @throws Exception Thrown if parsing or storing fails for any reason; the
     *  original failure is available as the cause.
     */
    public String importWikiXml(File file) throws Exception {
        // TODO read all params from JAMWiki properties
        // Raise the entity expansion limit so that big files can be parsed.
        // Note that this is a JVM-wide setting.
        System.setProperty("entityExpansionLimit", "1000000");
        // Use the default (non-validating) parser with this object as handler
        SAXParserFactory factory = SAXParserFactory.newInstance();
        disableExternalEntities(factory);
        try {
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(file, this);
        } catch (Throwable t) {
            logger.severe("Error by importing " + this.pageName, t);
            throw new Exception("Error by import: " + t.getMessage(), t);
        }
        return this.processedTopicName;
    }

    /**
     * Best-effort hardening against XML external entity (XXE) attacks: the
     * imported file is supplied by a user, so it must not be able to make the
     * parser read other files or URLs.  A parser that does not support one of
     * the features is still used, but the problem is logged.
     */
    private void disableExternalEntities(SAXParserFactory factory) {
        for (int i = 0; i < EXTERNAL_ENTITY_FEATURES.length; i++) {
            String feature = EXTERNAL_ENTITY_FEATURES[i];
            try {
                factory.setFeature(feature, false);
            } catch (javax.xml.parsers.ParserConfigurationException e) {
                logger.severe("Unable to disable XML parser feature " + feature, e);
            } catch (SAXException e) {
                logger.severe("Unable to disable XML parser feature " + feature, e);
            }
        }
    }

    //===========================================================
    // SAX DocumentHandler methods
    //===========================================================

    /**
     * Log the start of the document.
     */
    public void startDocument() throws SAXException {
        nl();
        nl();
        emit("START DOCUMENT");
        nl();
        emit("<?xml version='1.0' encoding='UTF-8'?>");
    }

    /**
     * Log the end of the document.
     */
    public void endDocument() throws SAXException {
        nl();
        emit("END DOCUMENT");
        nl();
    }

    /**
     * Start of an XML tag: log the tag and its attributes, reset the character
     * buffer, remember the key of a namespace declaration and reset the page
     * state when a new page begins.
     *
     * @param namespaceURI Namespace URI (not used).
     * @param lName Local name.
     * @param qName Qualified name.
     * @param attrs Attributes of the element, may be <code>null</code>.
     * @throws SAXException Thrown if a namespace element has a missing or
     *  non-numeric key attribute.
     */
    public void startElement(String namespaceURI, String lName, String qName, Attributes attrs)
            throws SAXException {
        indentLevel++;
        nl();
        emit("ELEMENT: ");
        String eName = resolveElementName(lName, qName);
        emit("<" + eName);
        lastStr = new StringBuffer();
        int attrCount = (attrs == null) ? 0 : attrs.getLength();
        for (int i = 0; i < attrCount; i++) {
            String aName = attrs.getLocalName(i); // Attr name
            if ("".equals(aName)) {
                aName = attrs.getQName(i);
            }
            nl();
            emit("   ATTR: ");
            emit(aName);
            emit("\t\"");
            emit(attrs.getValue(i));
            emit("\"");
        }
        if (attrCount > 0) {
            nl();
        }
        emit(">");
        if ("namespace".equals(eName)) { // mapping of namespaces from imported file
            String key = (attrs == null) ? null : attrs.getValue("key");
            try {
                nsKey = Integer.valueOf(key);
            } catch (NumberFormatException e) {
                throw new SAXException("Invalid or missing namespace key: " + key, e);
            }
        }
        if ("page".equals(eName)) {
            pageName = "";
            pageText = "";
        }
    }

    /**
     * End of an XML tag: log the tag, then act on the character data collected
     * since the matching start tag.  A namespace element is added to the
     * namespace table, title and text are remembered for the current page, and
     * the end of a page stores the page as a topic.
     *
     * @param namespaceURI Namespace URI (not used).
     * @param sName Simple name.
     * @param qName Qualified name.
     * @throws SAXException Thrown if the topic for a page cannot be written.
     */
    public void endElement(String namespaceURI, String sName, String qName) throws SAXException {
        // Resolve the name the same way startElement does; the simple name is
        // empty when the parser is not namespace aware.
        String eName = resolveElementName(sName, qName);
        nl();
        emit("END_ELM: ");
        emit("</" + eName + ">");
        if ("namespace".equals(eName)) { // mapping of namespaces from imported file
            String namespaceName = lastStr.toString().trim();
            namespaces.put(namespaceName, nsKey);
            // remember the names the imported file uses for categories and images
            if (nsKey.intValue() == MEDIAWIKI_NS_CATEGORY) {
                ns14 = namespaceName;
            }
            if (nsKey.intValue() == MEDIAWIKI_NS_IMAGE) {
                ns6 = namespaceName;
            }
        }
        if ("title".equals(eName)) {
            pageName = lastStr.toString().trim();
        }
        if ("text".equals(eName)) {
            pageText = lastStr.toString().trim();
        }
        if ("page".equals(eName)) {
            storeCurrentPage();
        }
        indentLevel--;
    }

    /**
     * Collect character data of the current element.  The parser may deliver
     * the text of one element in several chunks, so the data is appended.
     */
    public void characters(char buf[], int offset, int len) throws SAXException {
        if (lastStr == null) {
            lastStr = new StringBuffer();
        }
        lastStr.append(buf, offset, len);
    }

    /**
     * Return the local name of an element, or the qualified name if the local
     * name is not available.
     */
    private String resolveElementName(String localName, String qName) {
        if (localName == null || "".equals(localName)) {
            return qName;
        }
        return localName;
    }

    /**
     * Build a topic from the current page name and page text and write it
     * through the data handler.
     *
     * @throws SAXException Thrown (wrapping the original failure) if the topic
     *  cannot be parsed or written.
     */
    private void storeCurrentPage() throws SAXException {
        int namespace = findMediaWikiNamespace(pageName);
        // preprocess text of topic to fit JAMWiki
        pageText = preprocessText(pageText);
        Topic topic = new Topic();
        topic.setName(convertArticleNameFromWikipediaToJAMWiki(pageName));
        topic.setVirtualWiki(virtualWiki);
        topic.setTopicContent(pageText);
        TopicVersion topicVersion = new TopicVersion(user, authorIpAddress, "imported", pageText);
        // manage mapping between MediaWiki and JAMWiki namespaces
        // NOTE(review): namespaces without a JAMWiki equivalent, and titles whose
        // prefix is not a declared namespace, are stored with topic type -1.
        // Confirm whether such pages should rather be stored as plain articles.
        topic.setTopicType(convertNamespaceFromMediaWikiToJAMWiki(namespace));
        // Store topic in database
        try {
            WikiBase.getDataHandler().writeTopic(topic, topicVersion,
                    ParserUtil.parserDocument(pageText, virtualWiki, pageName), true, null);
            this.processedTopicName = topic.getName();
        } catch (Exception e) {
            throw new SAXException(e);
        }
    }

    /**
     * Determine the MediaWiki namespace id of a page title from the text in
     * front of its first colon.
     *
     * @param title The page title as found in the imported file.
     * @return The id declared for the prefix by the imported file, the main
     *  namespace id if the title contains no colon, or -1 if the prefix is not
     *  a declared namespace.
     */
    private int findMediaWikiNamespace(String title) {
        int pos = title.indexOf(':');
        if (pos < 0) {
            return MEDIAWIKI_NS_MAIN;
        }
        String prefix = title.substring(0, pos);
        if (!namespaces.containsKey(prefix)) {
            return UNKNOWN_NAMESPACE;
        }
        return ((Integer) namespaces.get(prefix)).intValue();
    }

    /**
     * Log a message fragment at fine level.
     */
    private void emit(String s) {
        logger.fine(s);
    }

    /**
     * Log a line separator followed by one indent unit per current nesting level.
     */
    private void nl() {
        logger.fine(LINE_END);
        for (int i = 0; i < indentLevel; i++) {
            logger.fine(XML_INDENT);
        }
    }

    /**
     * Convert a MediaWiki namespace id to a JAMWiki topic type.
     *
     * @param mediaWikiNamespaceId The MediaWiki namespace id.
     * @return The matching <code>Topic.TYPE_*</code> value, or -1 if the
     *  namespace has no mapping.
     */
    private int convertNamespaceFromMediaWikiToJAMWiki(int mediaWikiNamespaceId) {
        // redirects, files and system files are not mapped here
        switch (mediaWikiNamespaceId) {
        case MEDIAWIKI_NS_MAIN:
            return Topic.TYPE_ARTICLE;
        case MEDIAWIKI_NS_IMAGE:
            return Topic.TYPE_IMAGE;
        case MEDIAWIKI_NS_CATEGORY:
            return Topic.TYPE_CATEGORY;
        case MEDIAWIKI_NS_TEMPLATE:
            return Topic.TYPE_TEMPLATE;
        default:
            return -1;
        }
    }

    /**
     * Return the JAMWiki namespace name used for a topic type.
     *
     * @param jamWikiNamespaceId A <code>Topic.TYPE_*</code> value.
     * @return The namespace name for image, category and template topics, or
     *  an empty string for every other value.
     */
    private String getJAMWikiNamespaceById(int jamWikiNamespaceId) {
        switch (jamWikiNamespaceId) {
        case Topic.TYPE_IMAGE:
            return NamespaceHandler.NAMESPACE_IMAGE;
        case Topic.TYPE_CATEGORY:
            return NamespaceHandler.NAMESPACE_CATEGORY;
        case Topic.TYPE_TEMPLATE:
            return NamespaceHandler.NAMESPACE_TEMPLATE;
        default:
            return "";
        }
    }

    /**
     * Replace the namespace prefix of a page title from the imported file by
     * the JAMWiki name of that namespace.
     *
     * @param fullName The page title as found in the imported file.
     * @return The title with the JAMWiki namespace name, or the unchanged
     *  title if it has no prefix, the prefix is not a declared namespace, or
     *  the namespace has no JAMWiki equivalent.
     */
    private String convertArticleNameFromWikipediaToJAMWiki(String fullName) {
        int namespace = findMediaWikiNamespace(fullName);
        if (namespace == UNKNOWN_NAMESPACE) {
            return fullName;
        }
        String sJAMNamespace = getJAMWikiNamespaceById(convertNamespaceFromMediaWikiToJAMWiki(namespace));
        if (sJAMNamespace.length() == 0) {
            // main namespace, or no equivalent namespace in JAMWiki: use original name
            return fullName;
        }
        // a non-empty JAMWiki namespace is only returned for image, category
        // and template ids, which never apply to a title without a colon
        String sTitle = fullName.substring(fullName.indexOf(':') + 1);
        return sJAMNamespace + ":" + sTitle;
    }

    /**
     * Preprocess the text of a topic: rewrite category and image link prefixes
     * (lower case, English, and the names declared by the imported file) to
     * the JAMWiki names of these namespaces.
     *
     * @param text The topic text from the imported file.
     * @return The text with rewritten link prefixes.
     */
    public String preprocessText(String text) {
        String categoryPrefix = "[[" + NamespaceHandler.NAMESPACE_CATEGORY + ":";
        String imagePrefix = "[[" + NamespaceHandler.NAMESPACE_IMAGE + ":";
        String ret = text;
        ret = StringUtils.replace(ret, "[[category:", categoryPrefix);
        if (!"Category".equals(NamespaceHandler.NAMESPACE_CATEGORY)) {
            ret = StringUtils.replace(ret, "[[Category:", categoryPrefix);
        }
        ret = StringUtils.replace(ret, "[[" + ns14 + ":", categoryPrefix);
        ret = StringUtils.replace(ret, "[[image:", imagePrefix);
        // the replacement is only needed when JAMWiki uses another name than "Image"
        if (!"Image".equals(NamespaceHandler.NAMESPACE_IMAGE)) {
            ret = StringUtils.replace(ret, "[[Image:", imagePrefix);
        }
        ret = StringUtils.replace(ret, "[[" + ns6 + ":", imagePrefix);
        return ret;
    }
}