eu.scape_project.planning.xml.C3POProfileParser.java Source code

Java tutorial

Introduction

Here is the source code for eu.scape_project.planning.xml.C3POProfileParser.java

Source

/*******************************************************************************
 * Copyright 2006 - 2012 Vienna University of Technology,
 * Department of Software Technology and Interactive Systems, IFS
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *    http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 ******************************************************************************/
package eu.scape_project.planning.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Namespace;
import org.dom4j.QName;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import eu.scape_project.planning.model.FormatInfo;
import eu.scape_project.planning.model.SampleObject;
import eu.scape_project.planning.utils.ParserException;

/**
 * A simple parser for the c3po profile.
 * <p>
 * Usage: call {@link #read(InputStream, boolean)} once and then use the
 * getters to extract information from the parsed profile. All getters only
 * look at the first {@code partition} element of the profile.
 * 
 * @author Petar Petrov - <me@petarpetrov.org>
 * 
 */
public class C3POProfileParser {

    /**
     * The default namespace of a c3po profile.
     */
    private static final String C3PO_NAMESPACE = "http://ifs.tuwien.ac.at/dp/c3po";

    /**
     * A template for the description of the partition.
     */
    private static final String TYPE_OF_OBJECTS_BEGIN = "The collection consists of {1}% '{2}' files. ";

    /**
     * A template for the second most prominent type of objects in the
     * partition.
     */
    private static final String TYPE_OF_OBJECTS_SECOND = "It also contains {1}% '{2}' files. ";

    /**
     * A template for the conflicting objects in the partition.
     */
    private static final String TYPE_OF_OBJECTS_CONFLICTS = "{1}% files have conflicts. ";

    /**
     * A template for the unknown objects in the partition.
     */
    private static final String TYPE_OF_OBJECTS_UNKNOWN = "{1}% files have an unknown format. ";

    /**
     * A constant if the format distribution is missing.
     */
    private static final String MISSING = "No format distribution provided";

    /**
     * A template for the 'description of objects' field.
     */
    private static final String DESCRIPTION_OF_SAMPLES_TEMPLATE = "The sample objects were chosen by c3po using the {1} algorithm.";

    /**
     * The value used whenever a sample has more than one record for a
     * property that is expected to be single valued.
     */
    private static final String CONFLICT = "Conflict";

    /**
     * The chunk size used when buffering the profile stream.
     */
    private static final int BUFFER_SIZE = 4096;

    private static final Logger log = LoggerFactory.getLogger(C3POProfileParser.class);

    /**
     * The parsed profile, or null if no profile was read successfully.
     */
    private Document profile;

    /**
     * The namespace of the root element of the parsed profile, or null if no
     * profile was read successfully.
     */
    private Namespace namespace;

    /**
     * Reads the profile out of the input stream and validates if needed. If
     * the document is faulty for some reason an exception will be thrown.
     * <p>
     * The stream is fully buffered in memory first, because validation and
     * parsing are two separate passes and an input stream can only be
     * consumed once. The passed stream is always closed by this method.
     * 
     * @param stream
     *            the stream of the c3po xml profile to read.
     * @param validate
     *            whether or not to validate the xml.
     * @throws ParserException
     *             if the stream is missing or cannot be read, if validation
     *             is requested and fails, if the xml cannot be parsed or if
     *             the namespace of the root element is not the c3po
     *             namespace.
     */
    public void read(final InputStream stream, boolean validate) throws ParserException {
        // forget any previously read profile, so that a failed read never
        // leaves stale data behind.
        this.profile = null;
        this.namespace = null;

        if (stream == null) {
            throw new ParserException("Cannot read the c3po profile, no input stream was provided");
        }

        final byte[] content = this.readFully(stream);

        if (validate && !this.isValid(this.createValidatingParser(), new ByteArrayInputStream(content))) {
            throw new ParserException(
                    "Validation was turned on, but the xml file is not valid against the schema.");
        }

        // NOTE(review): the profile is parsed with a default SAXReader. If
        // profiles can come from untrusted sources, consider disabling DTDs
        // and external entities on the reader.
        final Document document;
        try {
            document = new SAXReader().read(new ByteArrayInputStream(content));
        } catch (final DocumentException e) {
            log.error("An error occurred while reading the profile: " + e.getMessage(), e);
            throw new ParserException("Cannot parse the profile: " + e.getMessage());
        }

        final Namespace rootNamespace = document.getRootElement().getNamespace();
        if (!C3PO_NAMESPACE.equals(rootNamespace.getStringValue())) {
            throw new ParserException("Cannot parse the profile, namespace does not match");
        }

        // only publish the document once it is known to be a c3po profile.
        this.profile = document;
        this.namespace = rootNamespace;
    }

    /**
     * Gets the collection identifier.
     * 
     * @return the id.
     */
    public String getCollectionId() {
        return this.getRoot().attributeValue("collection");
    }

    /**
     * Gets the partition filter key (used by c3po). Note that only the first
     * partition is used.
     * 
     * @return the partition filter key.
     */
    public String getPartitionFilterKey() {
        return this.getPartition().element("filter").attributeValue("id");
    }

    /**
     * Gets the objects count in the partition.
     * 
     * @return the count of objects.
     */
    public String getObjectsCountInPartition() {
        return this.getPartition().attributeValue("count");
    }

    /**
     * Gets a human readable description of the objects in the form of
     * {@link C3POProfileParser#DESCRIPTION_OF_SAMPLES_TEMPLATE}
     * 
     * @return the template with the 'type' attribute of the samples element
     *         filled in.
     */
    public String getDescriptionOfObjects() {
        final Element samples = this.getPartition().element("samples");
        final String type = samples.attributeValue("type");

        return DESCRIPTION_OF_SAMPLES_TEMPLATE.replace("{1}", type);
    }

    /**
     * Gets a human readable text description of the most prominent formats in
     * the profile. Traverses the properties of the profile until it finds the
     * format distribution. Then it takes the first two items of the
     * distribution (if existing). It also appends the percentage of
     * conflicted and unknown formats if any of the remaining items has the id
     * 'Conflicted' or 'Unknown'.
     * 
     * @return the human readable description, or {@link #MISSING} if the
     *         profile has no format distribution.
     */
    public String getTypeOfObjects() {
        final int count = Integer.parseInt(this.getObjectsCountInPartition());
        final List<Element> properties = this.getPartition().element("properties").elements("property");

        final List<Element> items = new ArrayList<Element>();
        for (Element property : properties) {
            if ("format".equals(property.attributeValue("id"))) {
                items.addAll(property.elements());
                break;
            }
        }

        if (items.isEmpty()) {
            return MISSING;
        }

        final StringBuilder response = new StringBuilder();

        final Element first = items.remove(0);
        response.append(TYPE_OF_OBJECTS_BEGIN.replace("{1}", percentage(first, count)).replace("{2}",
                first.attributeValue("id")));

        if (!items.isEmpty()) {
            final Element second = items.remove(0);
            response.append(TYPE_OF_OBJECTS_SECOND.replace("{1}", percentage(second, count)).replace("{2}",
                    second.attributeValue("id")));
        }

        // the first two items were removed above, so only the remaining
        // items are checked for the special 'Conflicted' and 'Unknown' ids.
        for (Element item : items) {
            final String id = item.attributeValue("id");
            if ("Conflicted".equals(id)) {
                response.append(TYPE_OF_OBJECTS_CONFLICTS.replace("{1}", percentage(item, count)));
            } else if ("Unknown".equals(id)) {
                response.append(TYPE_OF_OBJECTS_UNKNOWN.replace("{1}", percentage(item, count)));
            }
        }

        return response.toString();
    }

    /**
     * Gets a list of the sample objects and their metadata.
     * 
     * @return the list of {@link SampleObject}s
     */
    public List<SampleObject> getSampleObjects() {
        final List<SampleObject> objects = new ArrayList<SampleObject>();
        final List<Element> samples = this.getPartition().element("samples").elements();

        for (Element sample : samples) {
            objects.add(this.parseSample(sample));
        }

        return objects;
    }

    /**
     * Gets a list of proprietary identifiers as specified in the profile.
     * Depending on the use of the profile and whether or not the profile is
     * associated with a repository these identifiers can be different. The only
     * guarantee provided by the profile is that all uids are unique (within a
     * profile).
     * 
     * @return a list of repository/file specific unique identifiers of the
     *         objects in the collection.
     */
    public List<String> getObjectIdentifiers() {
        final List<String> uris = new ArrayList<String>();
        final List<Element> elements = this.getPartition().element("elements").elements("element");

        for (Element element : elements) {
            uris.add(element.attributeValue("uid"));
        }

        return uris;
    }

    /**
     * Gets the root element of the parsed profile.
     * 
     * @return the root element.
     * @throws IllegalStateException
     *             if no profile was read successfully before.
     */
    private Element getRoot() {
        if (this.profile == null) {
            throw new IllegalStateException("No profile available, read() has to be called successfully first");
        }

        return this.profile.getRootElement();
    }

    /**
     * Gets the first partition element of the parsed profile.
     * 
     * @return the first partition element.
     */
    private Element getPartition() {
        return this.getRoot().element("partition");
    }

    /**
     * Computes which share of the partition the passed distribution item
     * represents. The result is rendered as the string form of a floored
     * double (e.g. "50.0").
     * 
     * @param item
     *            the distribution item, its 'value' attribute is used.
     * @param count
     *            the overall count of objects in the partition.
     * @return the floored percentage as string.
     */
    private static String percentage(final Element item, final int count) {
        final double value = Double.parseDouble(item.attributeValue("value"));
        final double percent = Math.floor((value / count) * 100);

        return String.valueOf(percent);
    }

    /**
     * Creates a {@link SampleObject} out of the passed sample element. The
     * content type is set to 'Conflict' if there is more than one mimetype
     * record. The size is only set if there is exactly one size record.
     * 
     * @param sample
     *            the sample element of the profile.
     * @return the sample object.
     */
    private SampleObject parseSample(Element sample) {
        final String uid = sample.attributeValue("uid");
        final SampleObject sampleObject = new SampleObject(uid);
        sampleObject.setFullname(uid);

        final List<Element> mimes = findRecords(sample, "mimetype");
        if (!mimes.isEmpty()) {
            sampleObject.setContentType(valueOrConflict(mimes));
        }

        final List<Element> sizes = findRecords(sample, "size");
        if (sizes.size() == 1) {
            sampleObject.setSizeInBytes(Long.parseLong(sizes.get(0).attributeValue("value")));
        }

        final FormatInfo info = this.getFormatInfo(sample, sampleObject.getContentType());
        sampleObject.setFormatInfo(info);

        return sampleObject;
    }

    /**
     * Creates the {@link FormatInfo} of the passed sample element. Name,
     * version and puid are each set to 'Conflict' if the sample has more than
     * one record for them and are left untouched if there is no record.
     * 
     * @param sample
     *            the sample element of the profile.
     * @param mime
     *            the mime type to set on the format info.
     * @return the format info.
     */
    private FormatInfo getFormatInfo(Element sample, String mime) {
        final FormatInfo info = new FormatInfo();
        info.setMimeType(mime);

        final List<Element> formats = findRecords(sample, "format");
        if (!formats.isEmpty()) {
            info.setName(valueOrConflict(formats));
        }

        final List<Element> versions = findRecords(sample, "format_version");
        if (!versions.isEmpty()) {
            info.setVersion(valueOrConflict(versions));
        }

        final List<Element> puids = findRecords(sample, "puid");
        if (!puids.isEmpty()) {
            info.setPuid(valueOrConflict(puids));
        }

        return info;
    }

    /**
     * Collects all record elements of the sample with the passed name.
     * Records without a 'name' attribute are ignored.
     * 
     * @param sample
     *            the sample element of the profile.
     * @param name
     *            the value of the 'name' attribute to look for.
     * @return the matching records, never null.
     */
    private static List<Element> findRecords(final Element sample, final String name) {
        final List<Element> matches = new ArrayList<Element>();
        final List<Element> records = sample.elements("record");

        for (Element record : records) {
            if (name.equals(record.attributeValue("name"))) {
                matches.add(record);
            }
        }

        return matches;
    }

    /**
     * Resolves the value of a property out of its records.
     * 
     * @param records
     *            the records of the property, must not be empty.
     * @return 'Conflict' if there is more than one record, the 'value'
     *         attribute of the single record otherwise.
     */
    private static String valueOrConflict(final List<Element> records) {
        if (records.size() > 1) {
            return CONFLICT;
        }

        return records.get(0).attributeValue("value");
    }

    /**
     * Buffers the whole content of the stream in memory and closes the
     * stream afterwards.
     * 
     * @param stream
     *            the stream to read, must not be null.
     * @return the content of the stream.
     * @throws ParserException
     *             if the stream cannot be read.
     */
    private byte[] readFully(final InputStream stream) throws ParserException {
        try {
            final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            final byte[] chunk = new byte[BUFFER_SIZE];
            int read;
            while ((read = stream.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }

            return buffer.toByteArray();
        } catch (final IOException e) {
            log.error("An error occurred while reading the profile: " + e.getMessage(), e);
            throw new ParserException("Cannot read the profile: " + e.getMessage());
        } finally {
            try {
                stream.close();
            } catch (final IOException e) {
                log.error("An error occurred while closing the input stream: {}", e.getMessage());
            }
        }
    }

    /**
     * Obtains a validating parser from a new {@link ValidatingParserFactory}.
     * Failures are logged.
     * 
     * @return the parser or null if it could not be created.
     */
    private SAXParser createValidatingParser() {
        try {
            return new ValidatingParserFactory().getValidatingParser();
        } catch (final ParserConfigurationException e) {
            log.error("An error occurred while parsing the c3po profile: " + e.getMessage(), e);
        } catch (final SAXException e) {
            log.error("An error occurred while parsing the c3po profile: " + e.getMessage(), e);
        }

        return null;
    }

    // TODO read in the schema and if it is not of c3po - validate against
    // the current schema of c3po.
    /**
     * Validates the content of the stream with the passed parser. The stream
     * is consumed by this method.
     * 
     * @param parser
     *            the validating parser, may be null.
     * @param stream
     *            the stream of the xml document to validate.
     * @return true if the document was parsed and no error or fatal error
     *         was reported, false otherwise (also if the parser is null).
     */
    private boolean isValid(SAXParser parser, InputStream stream) {
        log.debug("validating collection profile");

        if (parser == null) {
            log.warn("Factory is not initialized. Did you call init()");
            return false;
        }

        try {
            final SimpleErrorHandler errorHandler = new SimpleErrorHandler();

            final SAXReader reader = new SAXReader(parser.getXMLReader());
            reader.setValidation(true);

            reader.setErrorHandler(errorHandler);
            reader.read(stream);

            return errorHandler.isValid();

        } catch (final SAXException e) {
            log.error("SAXException: " + e.getMessage(), e);
        } catch (final DocumentException e) {
            log.error("DocumentException: " + e.getMessage(), e);
        }

        return false;
    }

    /**
     * A simple error handler to catch if the xml document has some errors.
     * Errors and fatal errors mark the document as invalid, warnings are only
     * logged.
     * 
     * @author Petar Petrov - <me@petarpetrov.org>
     * 
     */
    private static class SimpleErrorHandler implements ErrorHandler {
        private boolean valid;

        public SimpleErrorHandler() {
            this.valid = true;
        }

        @Override
        public void error(SAXParseException e) throws SAXException {
            log.error("Error: {}", e.getMessage());
            this.valid = false;
        }

        @Override
        public void fatalError(SAXParseException e) throws SAXException {
            log.error("Fatal Error: {}", e.getMessage());
            this.valid = false;
        }

        @Override
        public void warning(SAXParseException e) throws SAXException {
            // warnings do not invalidate the document.
            log.warn("Warning: {}", e.getMessage());
        }

        /**
         * @return true if neither an error nor a fatal error was reported so
         *         far.
         */
        public boolean isValid() {
            return this.valid;
        }

    }

}