org.snipsnap.util.XMLSnipRepair.java Source code

Introduction

Here is the source code for org.snipsnap.util.XMLSnipRepair.java
Source

/*
 * This file is part of "SnipSnap Wiki/Weblog".
 *
 * Copyright (c) 2002 Stephan J. Schmidt, Matthias L. Jugel
 * All Rights Reserved.
 *
 * Please visit http://snipsnap.org/ for updates and contact.
 *
 * --LICENSE NOTICE--
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * --LICENSE NOTICE--
 */
package org.snipsnap.util;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringReader;
import java.text.NumberFormat;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * Command line tool that repairs a SnipSnap XML export.
 *
 * <p>The repair runs in three steps: the input file is parsed (control
 * characters that are illegal in XML are replaced by spaces while reading),
 * duplicate <code>user</code>/<code>snip</code> elements are reduced to the one
 * with the highest <code>mTime</code>, and the result is written out again.
 * When a web application directory is given, attachment files found below it
 * are embedded into the snips as Base64 encoded <code>data</code> elements.</p>
 *
 * <p>All progress and diagnostic output goes to <code>System.err</code> because
 * <code>System.out</code> is used for the XML itself when no output file is
 * given.</p>
 */
public class XMLSnipRepair {
    /** Name prefix of comment snips; the code expects "comment-" + snip name + "-" + suffix. */
    private static final String COMMENT_PREFIX = "comment-";

    /** Matches snip names that look like a day, e.g. "2003-01-31". */
    private static final java.util.regex.Pattern DAY_NAME =
            java.util.regex.Pattern.compile("\\d\\d\\d\\d-\\d\\d-\\d\\d");

    /**
     * Entry point: <code>XMLSnipRepair &lt;input file&gt; &lt;output file&gt; [&lt;webapp directory&gt;]</code>.
     *
     * @param args input file, output file and optionally the web application directory
     */
    public static void main(String args[]) {
        if (args.length < 2) {
            System.err.println("usage: XMLSnipRepair <input file> <output file> [<webapp directory>]");
            System.exit(0);
        }

        // The original parsed the arguments but never started the repair.
        File webAppDir = args.length > 2 ? new File(args[2]) : null;
        repair(new File(args[0]), new File(args[1]), webAppDir);
    }

    /**
     * Repairs the export stored in <code>input</code> and writes the result.
     *
     * @param input     the XML export to read
     * @param output    the file to write, or <code>null</code> to write to <code>System.out</code>
     * @param webAppDir web application directory to collect attachments from, or
     *                  <code>null</code> to skip attachment handling
     */
    public static void repair(File input, File output, File webAppDir) {
        System.err.println("STEP 1: parsing input file ...");
        Document document = null;
        try {
            document = load(input);
        } catch (Exception e) {
            System.err.println("Unable to read input document: " + e);
            System.err.println(
                    "This is usually the case for illegal XML characters, please manually edit the file and remove them.");
            System.exit(0);
        }

        System.err.println("STEP 2: checking SnipSpace consistency ...");
        Document repaired = repair(document, webAppDir);

        System.err.println("STEP 3: writing output file ...");
        OutputFormat outputFormat = new OutputFormat();
        outputFormat.setEncoding("UTF-8");
        outputFormat.setNewlines(true);
        OutputStream fileOut = null;
        try {
            OutputStream target = System.out;
            if (null != output) {
                fileOut = new FileOutputStream(output);
                target = fileOut;
            }
            // Hand the configured format to the writer; it was built but unused before.
            XMLWriter xmlWriter = new XMLWriter(target, outputFormat);
            xmlWriter.write(repaired);
            xmlWriter.flush();
        } catch (Exception e) {
            System.err.println("Error: unable to write data: " + e);
        } finally {
            // Only close what was opened here; System.out stays usable for the caller.
            if (fileOut != null) {
                try {
                    fileOut.close();
                } catch (IOException e) {
                    System.err.println("Error: unable to close output file: " + e);
                }
            }
        }
        System.err.println("Finished.");
    }

    /** Number of illegal characters replaced during the most recent {@link #load(File)}. */
    static int errCount = 0;
    /** Number of characters read so far during the most recent {@link #load(File)}. */
    static int curr = 0;

    /**
     * Parses an XML file into a dom4j document, replacing characters below 0x20
     * (other than TAB, LF and CR) by a space while reading and printing a rough
     * progress indicator to <code>System.err</code>.
     *
     * @param file the file to load from
     * @return the parsed document
     * @throws Exception if the file cannot be read or parsed
     */
    private static Document load(File file) throws Exception {
        final long fileLength = file.length();
        // The counters are static; start from zero so a second run reports correctly.
        errCount = 0;
        curr = 0;

        SAXReader saxReader = new SAXReader();
        System.err.print("0%");
        InputStreamReader reader = new InputStreamReader(new FileInputStream(file), "UTF-8") {
            public int read() throws IOException {
                int c = super.read();
                return c < 0 ? c : replaceIfIllegal((char) c);
            }

            public int read(char[] chars) throws IOException {
                return read(chars, 0, chars.length);
            }

            public int read(char[] chars, int start, int length) throws IOException {
                int n = super.read(chars, start, length);
                if (n > 0) {
                    // The characters just read live at [start, start + n), not [0, n).
                    for (int i = start; i < start + n; i++) {
                        chars[i] = replaceIfIllegal(chars[i]);
                    }
                    curr += n;
                    readProgress(fileLength, curr, n);
                }
                return n;
            }

            private char replaceIfIllegal(char c) {
                if (c < 0x20 && !(c == 0x09 || c == 0x0a || c == 0x0d)) {
                    errCount++;
                    return (char) 0x20;
                }
                return c;
            }

            /**
             * Prints a dot or a percentage when this block crossed a 5% or 20% mark.
             * <code>current</code> counts characters while <code>length</code> is the
             * file size in bytes, so the indicator is only approximate.
             */
            private void readProgress(long length, long current, int blockSize) {
                if (length <= 0) {
                    // Empty file: nothing to report and no division by zero.
                    return;
                }
                long percentage = current * 100 / length;
                long previous = (current - blockSize) * 100 / length;
                if (percentage % 5 != 0 && previous % 5 == 0) {
                    System.err.print(".");
                } else if (percentage % 20 == 0 && previous % 20 != 0) {
                    System.err.print(NumberFormat.getIntegerInstance().format(percentage) + "%");
                }
            }
        };

        Document document;
        try {
            document = saxReader.read(reader);
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // A failing close of an input stream must not hide a parse error.
            }
        }
        System.err.println();

        if (errCount > 0) {
            System.err.println("Replaced " + errCount + " illegal characters in input document by a space.");
            System.err.println("Characters not considered valid in an XML document are considered illegal.");
            System.err.println("This includes all characters with a code below 32 unless its TAB, CR or LF.");
        }

        return document;
    }

    /**
     * Builds a repaired copy of the document: one element per user login and per
     * snip name (the one with the highest <code>mTime</code> wins), comment and
     * day snips get their <code>commentSnip</code>/<code>parentSnip</code> fixed,
     * and attachments are embedded when <code>webAppRoot</code> is given.
     * Elements are moved (detached) from the input document into the result.
     *
     * @param document   the parsed input document
     * @param webAppRoot web application directory or <code>null</code>
     * @return a new document with the same root element name
     */
    private static Document repair(Document document, File webAppRoot) {
        Map userData = new TreeMap();
        Map snipData = new TreeMap();
        // A plain counter: dom4j elements are not Comparable and cannot be TreeMap keys.
        int unknownCount = 0;

        Element rootEl = document.getRootElement();
        Iterator elementIt = rootEl.elementIterator();

        System.err.println("STEP 2.1: checking for duplicates ...");
        long identDup = 0;
        long oldDup = 0;
        long newDup = 0;
        while (elementIt.hasNext()) {
            Element element = (Element) elementIt.next();
            Element idElement = null;
            Map data = null;
            if ("user".equals(element.getName())) {
                idElement = element.element("login");
                data = userData;
            } else if ("snip".equals(element.getName())) {
                idElement = element.element("name");
                data = snipData;
            }

            if (null == data || null == idElement) {
                System.err.println("Unknown element '" + element.getName() + "', ignoring ...");
                unknownCount++;
                continue;
            }

            String id = element.getName() + "[" + idElement.getText() + "]";
            Element existingElement = (Element) data.get(id);
            if (existingElement == null) {
                data.put(id, element);
                continue;
            }

            long mtime = readMTime(element);
            long lastmtime = readMTime(existingElement);
            if (mtime > lastmtime) {
                newDup++;
                System.err.println(
                        "Replacing duplicate by newer element: " + id + " (" + (mtime - lastmtime) + "ms)");
                data.put(id, element);
            } else if (mtime == lastmtime) {
                identDup++;
                System.err.println("Identical duplicate found: " + id);
            } else {
                oldDup++;
                System.err.println("Older duplicate found: " + id);
            }
        }

        System.err.println(
                "Found " + identDup + " identical duplicates, replaced " + newDup + ", ignored " + oldDup + ".");
        if (unknownCount > 0) {
            System.err.println("Found " + unknownCount + " unknown xml elements.");
        }

        Document outputDocument = DocumentHelper.createDocument();
        outputDocument.addElement(rootEl.getName());
        rootEl = outputDocument.getRootElement();

        System.err.println("STEP 2.2: finishing user data (" + userData.size() + ")...");
        Iterator userIt = userData.values().iterator();
        while (userIt.hasNext()) {
            Element userEl = (Element) userIt.next();
            rootEl.add(userEl.detach());
        }

        int attCount = 0;
        System.err.print("STEP 2.3: fixing snip data (" + snipData.size() + ")");
        // Stay on System.err: System.out may be carrying the XML output.
        if (webAppRoot != null) {
            System.err.println(" and attachments ...");
        } else {
            System.err.println();
        }
        Iterator snipIt = snipData.values().iterator();
        while (snipIt.hasNext()) {
            Element snipEl = (Element) snipIt.next();
            // Fix every snip that is kept, not just those that happened to have a duplicate.
            fixSnipRelations(snipEl);
            if (webAppRoot != null) {
                attCount += storeAttachments(snipEl, new File(webAppRoot, "/WEB-INF/files"));
                attCount += storeOldImages(snipEl, new File(webAppRoot, "/images"));
            }
            rootEl.add(snipEl.detach());
        }
        System.err.println("Added " + attCount + " attachments.");
        return outputDocument;
    }

    /**
     * Reads the <code>mTime</code> child of an element as a number.
     *
     * @param element a user or snip element
     * @return the modification time, or 0 when the child is missing or not a
     *         number, so that such an element loses against any dated duplicate
     */
    private static long readMTime(Element element) {
        Element mTimeEl = element.element("mTime");
        if (mTimeEl == null) {
            return 0L;
        }
        try {
            return Long.parseLong(mTimeEl.getTextTrim());
        } catch (NumberFormatException e) {
            return 0L;
        }
    }

    /**
     * Ensures that a comment snip ("comment-" + snip + "-" + suffix) names its
     * commented snip in <code>commentSnip</code> and that a day snip
     * ("yyyy-mm-dd") has <code>parentSnip</code> set to "start".
     *
     * @param snipEl a snip element that has a <code>name</code> child
     */
    private static void fixSnipRelations(Element snipEl) {
        String name = snipEl.element("name").getText();
        int lastDash = name.lastIndexOf('-');
        // Require a second dash after a non-empty snip name; "comment-foo" has none.
        if (name.startsWith(COMMENT_PREFIX) && lastDash > COMMENT_PREFIX.length()) {
            String commentSnip = name.substring(COMMENT_PREFIX.length(), lastDash);
            Element commentEl = snipEl.element("commentSnip");
            if (commentEl == null) {
                commentEl = snipEl.addElement("commentSnip");
            }
            if (!commentSnip.equalsIgnoreCase(commentEl.getText())) {
                // Replace the value; appending would concatenate old and new names.
                commentEl.setText(commentSnip);
                System.err.println("Fixing commented snip for '" + name + "' (" + commentSnip + ")");
            }
        } else if (DAY_NAME.matcher(name).matches()) {
            Element parentEl = snipEl.element("parentSnip");
            if (null == parentEl) {
                parentEl = snipEl.addElement("parentSnip");
            }
            if (!"start".equals(parentEl.getText())) {
                parentEl.setText("start");
                System.err.println("Fixing parent snip for '" + name + "'");
            }
        }
    }

    /**
     * Adds files named "image-" + snip name + "-" + file name found in
     * <code>imageRoot</code> as attachments of the snip, unless an attachment of
     * that file name is already listed. An <code>attachments</code> element is
     * created when the snip has none.
     *
     * @param snipEl    the snip element to extend
     * @param imageRoot directory containing the image files
     * @return the number of attachments added
     */
    private static int storeOldImages(Element snipEl, File imageRoot) {
        final String snipName = snipEl.element("name").getText();
        // Filter on the same prefix that is stripped below so both always agree.
        final String prefix = "image-" + snipName + "-";

        Element attachmentsEl = snipEl.element("attachments");
        if (null == attachmentsEl) {
            attachmentsEl = DocumentHelper.createElement("attachments");
            snipEl.add(attachmentsEl);
        }

        File[] files = imageRoot.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String s) {
                return s.startsWith(prefix) && s.length() > prefix.length();
            }
        });
        if (files == null) {
            // listFiles returns null when imageRoot is not a readable directory.
            return 0;
        }

        Set attList = new HashSet();
        Iterator attIt = attachmentsEl.elementIterator("attachment");
        while (attIt.hasNext()) {
            Element attEl = (Element) attIt.next();
            if (attEl != null && attEl.element("name") != null) {
                attList.add(attEl.element("name").getText());
            }
        }

        int attCount = 0;
        for (int n = 0; n < files.length; n++) {
            File file = files[n];
            String fileName = file.getName().substring(prefix.length());
            if (attList.contains(fileName)) {
                continue;
            }
            Element attEl = attachmentsEl.addElement("attachment");
            attEl.addElement("name").addText(fileName);
            attEl.addElement("content-type")
                    .addText("image/" + fileName.substring(fileName.lastIndexOf(".") + 1));
            attEl.addElement("size").addText("" + file.length());
            attEl.addElement("date").addText("" + file.lastModified());
            attEl.addElement("location").addText(snipName + "/" + fileName);
            try {
                addAttachmentFile(attEl, file);
                attCount++;
                // Report success only when the data really was added.
                System.err.println("Added old image attachment '" + fileName + "' to '" + snipName + "'");
            } catch (IOException e) {
                System.err.println("Error adding attachment data: " + e.getMessage());
                attEl.detach();
            }
        }
        return attCount;
    }

    /**
     * Embeds the file content of every listed attachment that has no
     * <code>data</code> child yet. If the <code>attachments</code> element holds
     * its attachment list as text instead of child elements, that text is parsed
     * into elements first. Attachments whose file is missing or unreadable are
     * removed from the list.
     *
     * @param snipEl  the snip element to process
     * @param attRoot directory that attachment locations are relative to
     * @return the number of attachments whose data was added
     */
    private static int storeAttachments(Element snipEl, File attRoot) {
        Element attachmentsEl = snipEl.element("attachments");
        if (attachmentsEl == null) {
            // Nothing listed for this snip.
            return 0;
        }
        attachmentsEl.detach();
        String textContent = attachmentsEl.getText();
        if (textContent != null && textContent.length() > 0 && attachmentsEl.elements("attachment").size() == 0) {
            SAXReader saxReader = new SAXReader();
            try {
                attachmentsEl = saxReader.read(new StringReader("<attachments>" + textContent + "</attachments>"))
                        .getRootElement();
            } catch (DocumentException e) {
                System.err.println("Error parsing the attachments ...: " + e.getMessage());
            }
        }

        String snipName = snipEl.element("name").getText();
        int attCount = 0;
        Iterator attIt = attachmentsEl.elements("attachment").iterator();
        while (attIt.hasNext()) {
            Element att = (Element) attIt.next();
            if (att.element("data") != null) {
                continue;
            }
            String location = att.elementText("location");
            if (location == null) {
                // Without a location there is no file to look up.
                System.err.println("Missing location for an attachment of '" + snipName + "'");
                att.detach();
                continue;
            }
            File file = new File(attRoot, location);
            if (!file.exists()) {
                System.err.println("Missing file '" + file.getPath() + "' attached to '" + snipName + "'");
                att.detach();
                continue;
            }
            try {
                addAttachmentFile(att, file);
                attCount++;
            } catch (Exception e) {
                System.err.println("Error adding '" + file.getPath() + "' to '" + snipName + "'");
                e.printStackTrace();
                att.detach();
            }
        }
        snipEl.add(attachmentsEl);
        return attCount;
    }

    /**
     * Reads a file completely and stores its Base64 encoded content as text of a
     * new <code>data</code> child of the attachment element.
     *
     * @param att     the attachment element to extend
     * @param attFile the file whose content is embedded
     * @throws IOException if the file cannot be read
     */
    public static void addAttachmentFile(Element att, File attFile) throws IOException {
        ByteArrayOutputStream data = new ByteArrayOutputStream();
        BufferedInputStream fileIs = new BufferedInputStream(new FileInputStream(attFile));
        try {
            byte[] buffer = new byte[8192];
            int count;
            while ((count = fileIs.read(buffer)) != -1) {
                data.write(buffer, 0, count);
            }
        } finally {
            // Release the file handle even when reading fails.
            fileIs.close();
        }
        att.addElement("data").addText(
                new String(org.apache.commons.codec.binary.Base64.encodeBase64(data.toByteArray()), "UTF-8"));
    }

}