eu.delving.sip.xml.SourceConverter.java Source code

Java tutorial

Introduction

Here is the source code for eu.delving.sip.xml.SourceConverter.java

Source

/*
 * Copyright 2011, 2012 Delving BV
 *
 * Licensed under the EUPL, Version 1.0 or - as soon they
 * will be approved by the European Commission - subsequent
 * versions of the EUPL (the "Licence");
 * you may not use this work except in compliance with the
 * Licence.
 * You may obtain a copy of the Licence at:
 *
 * http://ec.europa.eu/idabc/eupl
 *
 * Unless required by applicable law or agreed to in
 * writing, software distributed under the Licence is
 * distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the Licence for the specific language governing
 * permissions and limitations under the Licence.
 */

package eu.delving.sip.xml;

import eu.delving.XMLToolFactory;
import eu.delving.metadata.Hasher;
import eu.delving.metadata.Path;
import eu.delving.metadata.Tag;
import eu.delving.sip.base.CancelException;
import eu.delving.sip.base.ProgressListener;
import eu.delving.sip.base.Work;
import eu.delving.sip.files.DataSet;
import eu.delving.sip.files.Storage;
import eu.delving.sip.files.StorageException;
import eu.delving.stats.Stats;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;

import javax.xml.namespace.QName;
import javax.xml.stream.*;
import javax.xml.stream.events.*;
import javax.xml.transform.stream.StreamSource;
import java.io.*;
import java.security.DigestOutputStream;
import java.util.*;
import java.util.regex.Pattern;

import static eu.delving.sip.files.Storage.*;
import static eu.delving.sip.files.Storage.FileType.SOURCE;
import static eu.delving.sip.files.StorageHelper.*;
import static org.apache.commons.io.FileUtils.deleteQuietly;
import static org.apache.commons.io.FileUtils.moveFile;

/**
 * Create an output file with our standard record wrapping from a file of otherwise wrapped records, given by
 * the recordRootPath.  There is a potential regex-based conversion of the ID values on the way.  The uniqueness
 * of the ID is checked.
 *
 * <p>The input is read as a stream of StAX events.  While inside a record (an element whose path equals
 * the record root path) the events are buffered; when the record ends, the buffer is written out wrapped
 * in a record element carrying the unique value as an attribute.  Records without a unique value, with an
 * empty unique value, or with a unique value already seen are dropped (the latter are counted and reported).
 *
 * <p>Instances hold mutable parsing state in fields and are therefore meant for a single conversion run.
 *
 * @author Gerald de Jong <gerald@delving.eu>
 */

public class SourceConverter implements Work.DataSetWork, Work.LongTermWork {
    /** Separates the regex from the replacement in the "unique value converter" hint. */
    public static final String CONVERTER_DELIMITER = ":::";
    //    public static final String ANONYMOUS_RECORDS_PROPERTY = "anonymousRecords";
    private static final String XSI_SCHEMA = "http://www.w3.org/2001/XMLSchema-instance";
    private static final Pattern TO_UNDERSCORE = Pattern.compile("[:]");
    private final XMLInputFactory inputFactory = XMLToolFactory.xmlInputFactory();
    private final XMLOutputFactory outputFactory = XMLToolFactory.xmlOutputFactory();
    private final XMLEventFactory eventFactory = XMLToolFactory.xmlEventFactory();
    private Path recordRootPath;
    private Path uniqueElementPath;
    private int recordCount, totalRecords;//, anonymousRecords;
    private ProgressListener progressListener;
    /** Raw unique value collected for the record currently being buffered, or null if none was seen yet. */
    private String unique;
    /** The most recent start element, or null once any end element has been seen since. */
    private StartElement start;
    /** True while the parser is inside a record and events are being buffered. */
    private boolean recordEvents;
    private final List<XMLEvent> eventBuffer = new ArrayList<XMLEvent>();
    /** Cleaned, non-empty text lines collected since the last start/end element. */
    private final List<String> lines = new ArrayList<String>();
    private boolean finished = false;
    /** Converted unique values already written, used to detect repeats. */
    private final Set<String> uniqueness = new HashSet<String>();
    private Pattern converterPattern;
    private String converterReplacement;
    private int uniqueRepeatCount;
    private int maxUniqueValueLength;
    private final DataSet dataSet;
    private final Runnable work;

    /**
     * @param dataSet  the data set whose imported file is to be converted to source form
     * @param finished callback run after {@link #run()} completes, successfully or not; may be null
     */
    public SourceConverter(DataSet dataSet, Runnable finished) {
        this.dataSet = dataSet;
        this.work = finished;
    }

    @Override
    public DataSet getDataSet() {
        return dataSet;
    }

    @Override
    public Job getJob() {
        return Job.CONVERT_SOURCE;
    }

    /**
     * Convert the data set's imported file into the source file, then rename the result so that its
     * file name carries the hash computed while writing.  Failures are reported through the progress
     * listener's feedback and the partial output is deleted; the completion callback always runs.
     */
    @Override
    public void run() {
        try {
            interpretHints();
            if (!dataSet.isRecentlyImported()) {
                throw new StorageException("Import to source would be redundant, since source is newer");
            }
            Stats stats = dataSet.getStats(false, null);
            if (stats == null) {
                throw new StorageException("No analysis stats so conversion doesn't trust the record count");
            }
            File parent = dataSet.sourceOutput().getParentFile();
            dataSet.deleteSource(); // clean out before the new import
            deleteQuietly(statsFile(parent, true, null));
            Hasher hasher = new Hasher();
            DigestOutputStream digestOut = hasher.createDigestOutputStream(zipOut(dataSet.sourceOutput()));
            parse(dataSet.openImportedInputStream(), digestOut, stats.namespaces); // streams closed within parse()
            File hashedSource = new File(parent, hasher.prefixFileName(SOURCE.getName()));
            try {
                moveFile(dataSet.sourceOutput(), hashedSource);
            } catch (IOException e) {
                throw new StorageException("Unable to move file " + dataSet.sourceOutput().getAbsolutePath(), e);
            }
        } catch (StorageException e) {
            deleteQuietly(dataSet.sourceOutput());
            progressListener.getFeedback().alert("Conversion failed: " + e.getMessage(), e);
        } finally {
            if (work != null) {
                work.run();
            }
        }
    }

    /**
     * Read the data set hints and set up the record root path, expected record count, unique element
     * path, maximum unique value length and, if present, the unique value converter.  The converter hint
     * is only used when it contains {@link #CONVERTER_DELIMITER} after a non-empty regex part.
     *
     * @throws StorageException if the hints cannot be read
     */
    public void interpretHints() throws StorageException {
        Map<String, String> hints = dataSet.getHints();
        this.recordRootPath = getRecordRoot(hints);
        this.totalRecords = getRecordCount(hints);
        this.uniqueElementPath = getUniqueElement(hints);
        this.maxUniqueValueLength = getMaxUniqueValueLength(hints);
        String uniqueConverter = getUniqueValueConverter(hints);
        if (uniqueConverter != null) {
            int divider = uniqueConverter.indexOf(CONVERTER_DELIMITER);
            if (divider > 0) {
                converterPattern = Pattern.compile(uniqueConverter.substring(0, divider));
                converterReplacement = uniqueConverter.substring(divider + CONVERTER_DELIMITER.length());
            }
        }
    }

    @Override
    public void setProgressListener(ProgressListener progressListener) {
        this.progressListener = progressListener;
        this.progressListener.setProgressMessage("Converting to standard form");
    }

    /**
     * Stream the input XML to the output, re-wrapping each record.  Both streams are always closed
     * before this method returns, and any uniqueness violations are reported to the feedback.
     *
     * @param inputStream  the imported XML
     * @param outputStream where the standard-form XML is written as UTF-8
     * @param namespaces   prefix to URI map declared on the output envelope element
     * @throws StorageException on cancellation, on an over-long unique value, or on any parsing or
     *                          writing problem
     */
    public void parse(InputStream inputStream, OutputStream outputStream, Map<String, String> namespaces)
            throws StorageException {
        progressListener.prepareFor(totalRecords);
        //        anonymousRecords = Integer.parseInt(System.getProperty(ANONYMOUS_RECORDS_PROPERTY, "0"));
        Path path = Path.create();
        try {
            // NOTE(review): the second StreamSource argument is a system id, not an encoding, so the
            // parser still detects the input encoding itself.  Left as-is to keep that detection.
            XMLEventReader in = inputFactory.createXMLEventReader(new StreamSource(inputStream, "UTF-8"));
            XMLEventWriter out = outputFactory.createXMLEventWriter(new OutputStreamWriter(outputStream, "UTF-8"));
            processEvents: while (!finished) {
                XMLEvent event = in.nextEvent();
                switch (event.getEventType()) {
                case XMLEvent.START_DOCUMENT:
                    writeEnvelopeStart(out, namespaces);
                    break;
                case XMLEvent.START_ELEMENT:
                    boolean followsStart = start != null;
                    start = event.asStartElement();
                    bufferLooseText();
                    path = path.child(Tag.element(start.getName()));
                    handleStartElement(path, followsStart);
                    progressListener.setProgress(recordCount);
                    break;
                case XMLEvent.END_ELEMENT:
                    handleEndElement(path, event.asEndElement(), out);
                    start = null;
                    path = path.parent();
                    break;
                case XMLEvent.END_DOCUMENT:
                    out.add(eventFactory.createEndElement("", "", ENVELOPE_TAG));
                    out.add(eventFactory.createCharacters("\n"));
                    out.add(eventFactory.createEndDocument());
                    out.flush();
                    break processEvents;
                case XMLEvent.CHARACTERS:
                case XMLEvent.CDATA:
                    if (recordEvents) {
                        extractLines(event.asCharacters().getData());
                    }
                    break;
                }
            }
        } catch (CancelException e) {
            throw new StorageException("Cancelled", e);
        } catch (StorageException e) {
            throw e;
        } catch (Exception e) {
            throw new StorageException("Storage problem", e);
        } finally {
            // close first so that a failing alert can never leak the streams
            IOUtils.closeQuietly(inputStream);
            IOUtils.closeQuietly(outputStream);
            if (uniqueRepeatCount > 0) {
                progressListener.getFeedback().alert(String.format("Uniqueness violations : %d", uniqueRepeatCount));
            }
        }
    }

    /** Write the document start and the envelope element declaring the usable namespaces. */
    private void writeEnvelopeStart(XMLEventWriter out, Map<String, String> namespaces) throws XMLStreamException {
        out.add(eventFactory.createStartDocument());
        out.add(eventFactory.createCharacters("\n"));
        List<Namespace> nslist = new ArrayList<Namespace>();
        for (Map.Entry<String, String> entry : namespaces.entrySet()) {
            if (entry.getKey().isEmpty() || entry.getValue().trim().isEmpty()) {
                continue; // no default namespace and no blank URIs
            }
            if (XSI_SCHEMA.equals(entry.getValue())) {
                continue; // schema-instance attributes are filtered out, so the namespace is not needed
            }
            nslist.add(eventFactory.createNamespace(entry.getKey(), entry.getValue()));
        }
        out.add(eventFactory.createStartElement("", "", ENVELOPE_TAG, null, nslist.iterator()));
        out.add(eventFactory.createCharacters("\n"));
    }

    /**
     * Text that is followed by a child element (mixed content) is buffered as one text element per
     * line, so that it is not lost when the child element starts.
     */
    private void bufferLooseText() {
        if (!linesAvailable()) {
            return;
        }
        eventBuffer.add(eventFactory.createCharacters("\n"));
        eventBuffer.add(eventFactory.createStartElement("", "", TEXT_TAG, null, null));
        Iterator<String> walk = lines.iterator();
        while (walk.hasNext()) {
            eventBuffer.add(eventFactory.createCharacters(walk.next()));
            if (walk.hasNext()) {
                eventBuffer.add(eventFactory.createEndElement("", "", TEXT_TAG));
                eventBuffer.add(eventFactory.createCharacters("\n"));
                eventBuffer.add(eventFactory.createStartElement("", "", TEXT_TAG, null, null));
            }
        }
        eventBuffer.add(eventFactory.createEndElement("", "", TEXT_TAG));
        lines.clear();
    }

    /**
     * Handle the end of the element at the given path.  The end of the record root flushes or discards
     * the buffered record; the end of any other element inside a record buffers its text (repeating the
     * element once per line) or removes the element entirely if it turned out to be empty.
     */
    private void handleEndElement(Path path, EndElement end, XMLEventWriter out) throws StorageException {
        if (!recordEvents) {
            return;
        }
        if (path.equals(recordRootPath)) {
            if (unique != null) {
                outputRecord(out);
                recordCount++;
            } else {
                clearEvents(); // no unique value was found, so the record is dropped
            }
            return;
        }
        if (!uniqueElementPath.peek().isAttribute() && path.equals(uniqueElementPath)) {
            unique = StringUtils.join(lines, "");
        }
        boolean addEndTag = true;
        if (linesAvailable()) {
            Iterator<String> walk = lines.iterator();
            while (walk.hasNext()) {
                eventBuffer.add(eventFactory.createCharacters(walk.next()));
                if (walk.hasNext()) {
                    // more lines follow: close this element and open a fresh copy for the next line
                    eventBuffer.add(end);
                    eventBuffer.add(eventFactory.createCharacters("\n"));
                    reopenElement(path, end);
                }
            }
        } else {
            if (eventBuffer.get(eventBuffer.size() - 1).isStartElement()) {
                eventBuffer.remove(eventBuffer.size() - 1); // remove the start event
                addEndTag = false;
            }
        }
        lines.clear();
        if (addEndTag) {
            eventBuffer.add(end);
            eventBuffer.add(eventFactory.createCharacters("\n"));
        }
    }

    /**
     * Buffer a start element matching the element being ended, for repeating it once per text line.
     * When the element had child elements the remembered start element has already been reset, so the
     * start element is rebuilt from the end element's name (without attributes) instead of failing.
     */
    private void reopenElement(Path path, EndElement end) {
        if (start != null) {
            handleStartElement(path, false);
        } else {
            eventBuffer.add(eventFactory.createStartElement(end.getName(), null, null));
        }
    }

    /**
     * Drop blank entries from the collected lines.
     *
     * @return true if any non-blank line remains
     */
    private boolean linesAvailable() {
        Iterator<String> iterator = lines.iterator();
        while (iterator.hasNext()) {
            String line = iterator.next();
            if (line.trim().isEmpty()) {
                iterator.remove();
            }
        }
        return !lines.isEmpty();
    }

    /**
     * Write the buffered record wrapped in a record element carrying the unique value, unless that
     * value is empty or has been seen before.  The collected unique value and the buffer are always
     * reset afterwards so that nothing carries over into the next record.
     */
    private void outputRecord(XMLEventWriter out) throws StorageException {
        //        if (anonymousRecords == 0 || recordCount < anonymousRecords) {
        String uniqueValue = getUniqueValue();
        unique = null; // consumed: a later record without an id must not inherit this one
        if (!uniqueValue.isEmpty()) {
            if (!uniqueness.add(uniqueValue)) {
                uniqueRepeatCount++;
            } else {
                Attribute id = eventFactory.createAttribute(Storage.UNIQUE_ATTR, uniqueValue);
                List<Attribute> attrs = new ArrayList<Attribute>();
                attrs.add(id);
                try {
                    out.add(eventFactory.createStartElement("", "", RECORD_TAG, attrs.iterator(), null));
                    for (XMLEvent bufferedEvent : eventBuffer) {
                        out.add(bufferedEvent);
                    }
                    out.add(eventFactory.createEndElement("", "", RECORD_TAG));
                    out.add(eventFactory.createCharacters("\n"));
                } catch (XMLStreamException e) {
                    throw new StorageException("Problem writing XML", e);
                }
            }
        }
        //        }
        clearEvents();
    }

    /**
     * Derive the identifier from the collected unique value: trim it, turn colons into dashes, apply
     * the optional converter regex once, and finally turn any colons the converter introduced into
     * underscores.
     *
     * @throws StorageException if the converted value is longer than the configured maximum
     */
    private String getUniqueValue() throws StorageException {
        String trimmed = unique.trim().replace(':', '-');
        String modified = converterPattern != null
                ? converterPattern.matcher(trimmed).replaceFirst(converterReplacement)
                : trimmed;
        if (modified.length() > maxUniqueValueLength) {
            throw new StorageException("Unique value too large: " + unique);
        }
        return TO_UNDERSCORE.matcher(modified).replaceAll("_");
    }

    /**
     * Inside a record, buffer the current start element (without namespace declarations and without
     * schema-instance attributes).  Outside a record, start buffering when the path is the record root;
     * the record root element itself is not buffered.  Uses the {@code start} field, which must be set.
     *
     * @param followsStart true if the previous structural event was also a start element, in which
     *                     case a line break is buffered first
     */
    private void handleStartElement(Path path, boolean followsStart) {
        if (recordEvents) {
            if (followsStart) {
                eventBuffer.add(eventFactory.createCharacters("\n"));
            }
            if (uniqueElementPath.peek().isAttribute()) {
                handleAttributes(path, start);
            }
            eventBuffer.add(eventFactory.createStartElement(start.getName(), filteredAttributes(), null)); // remove namespaces
        } else if (path.equals(recordRootPath)) {
            recordEvents = true;
            if (uniqueElementPath.peek().isAttribute()) {
                handleAttributes(path, start);
            }
        }
    }

    /** @return the attributes of the current start element, minus those in the schema-instance namespace */
    private Iterator<Attribute> filteredAttributes() {
        Iterator aw = start.getAttributes();
        List<Attribute> attributes = new ArrayList<Attribute>();
        while (aw.hasNext()) {
            Attribute attribute = (Attribute) aw.next();
            if (XSI_SCHEMA.equals(attribute.getName().getNamespaceURI())) {
                continue;
            }
            attributes.add(attribute);
        }
        return attributes.iterator();
    }

    /** Run {@link #handleAttribute(Path, Attribute)} for every attribute of the given start element. */
    private void handleAttributes(Path path, StartElement start) {
        Iterator attrWalk = start.getAttributes();
        while (attrWalk.hasNext()) {
            handleAttribute(path, (Attribute) attrWalk.next());
        }
    }

    /**
     * Capture the attribute's value as the unique value if its path is the unique element path.
     * Otherwise, an attribute of the record root is buffered as a child element named after the
     * attribute with an underscore prefix, since the record root element itself is not kept.
     */
    private void handleAttribute(Path path, Attribute attr) {
        Path extended = path.child(Tag.attribute(attr.getName()));
        if (extended.equals(uniqueElementPath)) {
            unique = attr.getValue();
        } else if (path.equals(recordRootPath)) {
            QName a = attr.getName();
            QName attrName = new QName(a.getNamespaceURI(), "_" + a.getLocalPart(), a.getPrefix());
            eventBuffer.add(eventFactory.createStartElement(attrName, null, null));
            eventBuffer.add(eventFactory.createCharacters(attr.getValue()));
            eventBuffer.add(eventFactory.createEndElement(attrName, null));
            eventBuffer.add(eventFactory.createCharacters("\n"));
        }
    }

    /**
     * Split the text on line breaks and collect each non-empty line, with every whitespace character
     * turned into a space, runs of spaces collapsed and the ends trimmed.
     */
    private void extractLines(String value) {
        for (String line : value.split(" *[\n\r]+ *")) {
            if (line.isEmpty()) {
                continue;
            }
            StringBuilder out = new StringBuilder(line.length());
            for (char c : line.toCharArray()) {
                out.append(Character.isWhitespace(c) ? ' ' : c);
            }
            String clean = out.toString().replaceAll(" +", " ").trim();
            //            if (anonymousRecords > 0) clean = anonymize(clean);
            if (!clean.isEmpty()) {
                lines.add(clean);
            }
        }
    }

    /**
     * Scramble the letters of a value; for values starting with "http" the part before "//" and the
     * part from the final slash onwards are kept.  Only referenced from commented-out code.
     */
    private String anonymize(String string) {
        if (string.startsWith("http")) { // preserve the beginning and the end
            int slashSlash = string.indexOf("//");
            // NOTE(review): searching from slashSlash + 1 finds the second slash of "//" itself - confirm intent
            int nextSlash = string.indexOf("/", slashSlash + 1);
            int finalSlash = string.lastIndexOf("/");
            if (slashSlash > 0 && nextSlash > 0 && finalSlash > 0) {
                return String.format("%s%s%s%s", string.substring(0, slashSlash),
                        anonymizeString(string.substring(slashSlash, nextSlash)),
                        anonymizeString(string.substring(nextSlash, finalSlash)), string.substring(finalSlash));
            }
        }
        return anonymizeString(string);
    }

    /**
     * Replace every lower-case and upper-case letter with a pseudo-random letter of the same case,
     * seeded by the string's hash code so that equal inputs give equal outputs.  Strings with more
     * digits than letters are returned unchanged.
     */
    private String anonymizeString(String string) {
        if (moreNumbersThanLetters(string)) {
            return string;
        }
        StringBuilder out = new StringBuilder(string.length());
        Random random = new Random(string.hashCode());
        for (char c : string.toCharArray()) {
            if (Character.isLowerCase(c)) {
                out.append((char) ('a' + letterOffset(random)));
            } else if (Character.isUpperCase(c)) {
                out.append((char) ('A' + letterOffset(random)));
            } else {
                out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * @return a pseudo-random offset in the range 0..25.  The remainder is taken before the absolute
     *         value because Math.abs(Integer.MIN_VALUE) is itself negative.
     */
    private static int letterOffset(Random random) {
        return Math.abs(random.nextInt() % 26);
    }

    private boolean moreNumbersThanLetters(String string) {
        int letters = 0, numbers = 0;
        for (char c : string.toCharArray()) {
            if (Character.isLetter(c)) {
                letters++;
            } else if (Character.isDigit(c)) {
                numbers++;
            }
        }
        return numbers > letters;
    }

    /** Leave record mode and discard everything collected for the record, including pending text. */
    private void clearEvents() {
        recordEvents = false;
        eventBuffer.clear();
        lines.clear(); // text left directly under the record root must not leak into the next record
    }

}