com.petpet.c3po.analysis.ProfileGenerator.java Source code

Java tutorial

Introduction

Here is the source code for com.petpet.c3po.analysis.ProfileGenerator.java

Source

/*******************************************************************************
 * Copyright 2013 Petar Petrov <me@petarpetrov.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.petpet.c3po.analysis;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.util.*;

import com.petpet.c3po.utils.VocabularyDP;
import org.bson.types.ObjectId;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.petpet.c3po.api.dao.PersistenceLayer;
import com.petpet.c3po.api.model.Property;
import com.petpet.c3po.api.model.Source;
import com.petpet.c3po.api.model.helper.Filter;
import com.petpet.c3po.api.model.helper.FilterCondition;
import com.petpet.c3po.api.model.helper.MetadataRecord;
import com.petpet.c3po.api.model.helper.MetadataRecord.Status;
import com.petpet.c3po.api.model.helper.NumericStatistics;
import com.petpet.c3po.api.model.helper.PropertyType;

/**
 * Generates a c3po content profile in an xml format according to the
 * c3po/format/c3po.xsd schema.
 *
 * @author Petar Petrov <me@petarpetrov.org>
 *
 */
public class ProfileGenerator {

    /**
     * Default logger.
     */
    private static final Logger LOG = LoggerFactory.getLogger(ProfileGenerator.class);

    /**
     * The {@link com.petpet.c3po.api.model.Element} class.
     */
    private static final Class<com.petpet.c3po.api.model.Element> ELEMENT_CLASS = com.petpet.c3po.api.model.Element.class;

    /**
     * A set of properties that will be expanded with histograms.
     */
    private static final String[] PROPERTIES = { "format", "format_version", "puid", "mimetype", "charset",
            "linebreak", "compressionscheme", "creating_os", "byteorder", "compression_scheme", "colorspace",
            "icc_profile_name", "icc_profile_version", "created", "creating.application.name", "dissimilarities" };

    /**
     * The persistence layer used for the count, find and histogram queries of
     * this generator. It is assigned once in the constructor and never
     * replaced afterwards.
     */
    private final PersistenceLayer persistence;

    /**
     * The representative samples generator.
     */
    private final RepresentativeGenerator sampleSelector;

    /**
     * Builds a profile generator that reads from the given persistence layer
     * and picks its samples with the given representative generator. The
     * {@link VocabularyDP} is initialised as part of the construction.
     *
     * @param persistence
     *          the persistence layer to query.
     * @param generator
     *          the generator that selects the representative samples.
     */
    public ProfileGenerator(final PersistenceLayer persistence, RepresentativeGenerator generator) {
        VocabularyDP.init();
        this.sampleSelector = generator;
        this.persistence = persistence;
    }

    /**
     * Parses the given xml string and writes the resulting document to the
     * default location (see {@link #write(Document)}). If the string cannot be
     * parsed, a warning including the full exception is logged and nothing is
     * written.
     *
     * @param xml
     *          the xml to write.
     */
    public void write(final String xml) {
        // NOTE(review): parseText uses dom4j's default parser configuration;
        // if xml can come from an untrusted source, confirm that external
        // entities are disabled in the dom4j version in use.
        try {
            this.write(DocumentHelper.parseText(xml));
        } catch (final DocumentException e) {
            // pass the exception itself so that the stack trace is not lost
            LOG.warn("An error occurred: " + e.getMessage(), e);
        }
    }

    /**
     * Writes the xml document to the default location, the file 'output.xml'
     * within a local 'profiles' folder.
     *
     * @param doc
     *          the document to write.
     */
    public void write(final Document doc) {
        final String defaultPath = "profiles/output.xml";
        this.write(doc, defaultPath);
    }

    /**
     * Writes the given document pretty printed to the given path. Missing
     * parent folders are created. The bytes are encoded by the
     * {@link XMLWriter} with the encoding of the {@link OutputFormat}, so that
     * the file content matches the encoding named in the xml declaration
     * instead of depending on the platform default charset.
     *
     * I/O problems are logged and not propagated to the caller.
     *
     * @param doc
     *          the document to write.
     * @param path
     *          the path to write to.
     */
    public void write(final Document doc, final String path) {
        final OutputFormat format = OutputFormat.createPrettyPrint();
        final File file = new File(path);

        LOG.info("Will create profile in {}", file.getAbsolutePath());

        final File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            // opening the stream below will fail and report the actual error
            LOG.warn("Could not create folder {}", parent.getAbsolutePath());
        }

        try {
            // creates the file if it does not exist yet
            final OutputStream out = new FileOutputStream(file);
            try {
                final XMLWriter writer = new XMLWriter(out, format);
                writer.write(doc);
                // the writer buffers internally, push everything to the stream
                // before it gets closed
                writer.flush();
            } finally {
                // release the file handle even if writing failed
                out.close();
            }
        } catch (final IOException e) {
            LOG.error("Could not write profile to " + file.getAbsolutePath(), e);
        }
    }

    /**
     * Generates a profile for the given filter using the defaults: a sample
     * set of size 5 and no element identifiers in the output.
     *
     * @param filter
     *          the filter to apply.
     * @return a xml document for the given profile.
     */
    public Document generateProfile(Filter filter) {
        final int defaultSampleSize = 5;
        final boolean includeElements = false;
        return this.generateProfile(filter, defaultSampleSize, includeElements);
    }

    /**
     * Generates the profile document for the elements matching the given
     * filter. The document consists of a root element with one partition that
     * holds the filter description, the properties, the samples and the
     * elements (in this order).
     *
     * @param filter
     *          the filter to use.
     * @param sampleSize
     *          the size of the samples.
     * @param includeelements
     *          whether or not to include the element identifiers.
     * @return the xml document of the profile.
     */
    public Document generateProfile(Filter filter, int sampleSize, boolean includeelements) {
        // TODO check if subFilter is changed
        // if not then it does not have a collection filter...
        // and the collection name should be different...
        final long total = this.persistence.count(ELEMENT_CLASS, filter.subFilter("collection"));
        final String collection = this.getCollectionNameFromFilter(filter);

        final Document profile = DocumentHelper.createDocument();
        final Element root = this.createRootElement(profile, collection, total);
        final Element partition = this.createPartition(root, filter);

        // the order of the following calls defines the order of the children
        // of the partition element
        this.genereateFilterElement(partition, filter);
        this.generateProperties(filter, this.createPropertiesElement(partition));
        this.createSamples(filter, partition, sampleSize);
        this.createElements(filter, partition, includeelements);

        return profile;
    }

    /**
     * Returns the name of the collection from the filter. If the filter is
     * null, then 'all-data' is returned. If the filter is not null, but does
     * not contain a collection condition, then an empty string is returned.
     * Otherwise the values of all collection conditions are concatenated, each
     * one followed by a single space (so the result ends with a space).
     *
     * Conditions without a field or without a value are ignored.
     *
     * @param filter
     *          the filter to check.
     * @return the name of the collection.
     */
    private String getCollectionNameFromFilter(Filter filter) {

        if (filter == null) {
            return "all-data";
        }

        final StringBuilder name = new StringBuilder();
        for (FilterCondition fc : filter.getConditions()) {
            // constant first, so that a condition without a field cannot
            // cause a NullPointerException
            if ("collection".equals(fc.getField()) && fc.getValue() != null) {
                name.append(fc.getValue().toString()).append(' ');
            }
        }

        return name.toString();
    }

    /**
     * Appends a 'filter' element to the given partition. The element gets a
     * random id and a 'parameters' child with one 'parameter' (name and value)
     * per condition of the filter. A condition without a value results in an
     * empty 'value' element.
     *
     * @param partition
     *          the partition element to append to.
     * @param filter
     *          the filter to describe.
     */
    private void genereateFilterElement(Element partition, Filter filter) {
        final Element elmntFilter = partition.addElement("filter");
        // TODO get rid of id
        elmntFilter.addAttribute("id", UUID.randomUUID().toString());
        final Element parameters = elmntFilter.addElement("parameters");

        for (FilterCondition fc : filter.getConditions()) {
            final Element parameter = parameters.addElement("parameter");
            parameter.addElement("name").addText(fc.getField());

            // conditions may carry a null value, do not fail on those
            final Element value = parameter.addElement("value");
            final Object raw = fc.getValue();
            if (raw != null) {
                value.addText(raw.toString());
            }
        }

    }

    /**
     * Appends a 'partition' element to the given root. Its 'count' attribute
     * is the number of elements matching the given filter.
     *
     * @param root
     *          the element to append to.
     * @param filter
     *          the filter used for counting.
     * @return the new partition element.
     */
    private Element createPartition(Element root, Filter filter) {
        final long matching = this.persistence.count(ELEMENT_CLASS, filter);
        return root.addElement("partition").addAttribute("count", Long.toString(matching));
    }

    /**
     * Goes over all properties known to the persistence layer and creates a
     * property element for each one that occurs in at least one element
     * matching the filter.
     *
     * @param filter
     *          the filter of the profile.
     * @param properties
     *          the 'properties' element to append to.
     */
    private void generateProperties(final Filter filter, final Element properties) {
        for (Iterator<Property> it = this.persistence.find(Property.class, null); it.hasNext();) {
            final Property property = it.next();

            // work on a copy, the passed filter must not be modified
            final Filter withProperty = new Filter(filter);
            if (!withProperty.contains(property.getId())) {
                withProperty.addFilterCondition(new FilterCondition(property.getId(), null));
            }

            final long occurrences = this.persistence.count(ELEMENT_CLASS, withProperty);
            if (occurrences == 0) {
                continue;
            }

            this.createPropertyElement(filter, properties, property, occurrences);
        }
    }

    /**
     * Appends a 'property' element with the attributes id, uri (only if the
     * vocabulary knows one for the property key), type and count, and lets the
     * processing method matching the property type add the details.
     *
     * @param filter
     *          the filter of the profile.
     * @param properties
     *          the 'properties' element to append to.
     * @param p
     *          the property to describe.
     * @param count
     *          the value of the count attribute.
     */
    private void createPropertyElement(final Filter filter, final Element properties, final Property p,
            long count) {

        final String uri = VocabularyDP.getUriByName(p.getKey());

        Element prop = properties.addElement("property").addAttribute("id", p.getKey());
        if (uri != null) {
            prop = prop.addAttribute("uri", uri);
        }
        prop = prop.addAttribute("type", p.getType()).addAttribute("count", Long.toString(count));

        final PropertyType type = PropertyType.valueOf(p.getType());

        if (type == PropertyType.STRING) {
            this.processStringProperty(filter, prop, p);
        } else if (type == PropertyType.BOOL) {
            this.processBoolProperty(filter, prop, p);
        } else if (type == PropertyType.INTEGER || type == PropertyType.FLOAT) {
            this.processNumericProperty(filter, prop, p);
        } else if (type == PropertyType.DATE) {
            this.processDateProperty(filter, prop, p);
        }
    }

    /**
     * Adds the 'profile' root element to the document and sets its xsi
     * namespace declaration as well as the collection, date (the current
     * time) and count attributes.
     *
     * @param doc
     *          the document to add the root to.
     * @param collection
     *          the value of the collection attribute.
     * @param count
     *          the value of the count attribute.
     * @return the root element.
     */
    private Element createRootElement(final Document doc, final String collection, final long count) {
        Element profile = doc.addElement("profile", "http://ifs.tuwien.ac.at/dp/c3po");
        profile = profile.addAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
        profile = profile.addAttribute("collection", collection);
        profile = profile.addAttribute("date", new Date().toString());
        profile = profile.addAttribute("count", Long.toString(count));
        return profile;
    }

    /**
     * Appends an empty 'properties' element to the given partition.
     *
     * @param partition
     *          the partition element to append to.
     * @return the new properties element.
     */
    private Element createPropertiesElement(final Element partition) {
        final Element properties = partition.addElement("properties");
        return properties;
    }

    /**
     * Appends a 'samples' element to the partition. Its type attribute is the
     * type of the samples generator and it gets one sample child per
     * identifier that the generator returns for the given filter.
     *
     * @param filter
     *          the filter handed to the samples generator.
     * @param partition
     *          the partition element to append to.
     * @param sampleSize
     *          the size passed to the samples generator.
     */
    private void createSamples(final Filter filter, final Element partition, int sampleSize) {
        final Element samples = partition.addElement("samples");
        samples.addAttribute("type", this.sampleSelector.getType());

        this.sampleSelector.setFilter(filter);
        final List<String> representatives = this.sampleSelector.execute(sampleSize);
        LOG.debug("Found {} representatives", representatives.size());

        final Iterator<String> ids = representatives.iterator();
        while (ids.hasNext()) {
            final String id = ids.next();
            LOG.debug("Processing sample {}", id);
            this.createSampleElement(samples, id);
        }
    }

    /**
     * Looks up the element with the given id and appends a 'sample' element
     * with one 'record' per metadata value to the given 'samples' element.
     * Records without values are skipped. For a record in conflict every
     * value gets its own 'record' element, otherwise a single 'record' with
     * the string form of the whole value list is written.
     *
     * If no element with the given id is found, a warning is logged and no
     * sample is written.
     *
     * @param samples
     *          the 'samples' element to append to.
     * @param id
     *          the identifier of the sample element.
     */
    private void createSampleElement(final Element samples, final String id) {
        final Iterator<com.petpet.c3po.api.model.Element> iter = this.persistence.find(ELEMENT_CLASS,
                new Filter(new FilterCondition("_id", new ObjectId(id))));

        // an assert is only evaluated if the JVM runs with -ea, so the
        // missing element has to be handled explicitly
        if (iter == null || !iter.hasNext()) {
            LOG.warn("No element found for sample {}, skipping it", id);
            return;
        }

        final com.petpet.c3po.api.model.Element element = iter.next();
        final Element sample = samples.addElement("sample").addAttribute("id", id);
        final String conflict = Status.CONFLICT.toString();

        for (MetadataRecord mr : element.getMetadata()) {
            LOG.debug("Metadata record: {}", mr.getProperty());
            if (mr.getValues().size() == 0) {
                continue;
            }

            // constant first, a record without status is treated as not conflicting
            if (conflict.equals(mr.getStatus())) {
                // NOTE(review): every conflicting value is attributed to the
                // first source of the record; confirm whether the i-th source
                // was intended for the i-th value.
                for (int i = 0; i < mr.getValues().size(); i++) {
                    sample.addElement("record").addAttribute("name", mr.getProperty())
                            .addAttribute("value", mr.getValues().get(i).toString())
                            .addAttribute("tool", mr.getSources().get(0));
                }
            } else {
                sample.addElement("record").addAttribute("name", mr.getProperty())
                        .addAttribute("value", mr.getValues().toString())
                        .addAttribute("tool", mr.getSources().get(0));
            }
        }
    }

    /**
     * Appends an 'elements' element to the partition. It stays empty unless
     * the element identifiers are requested, in which case it gets one
     * 'element' child with the uid of every element matching the filter.
     *
     * @param filter
     *          the filter selecting the elements.
     * @param partition
     *          the partition element to append to.
     * @param includeelements
     *          whether or not to list the element identifiers.
     */
    private void createElements(final Filter filter, final Element partition, boolean includeelements) {
        final Element elements = partition.addElement("elements");

        if (!includeelements) {
            return;
        }

        for (Iterator<com.petpet.c3po.api.model.Element> it = this.persistence.find(ELEMENT_CLASS, filter); it
                .hasNext();) {
            elements.addElement("element").addAttribute("uid", it.next().getUid());
        }
    }

    /**
     * Expands the given property element with the histogram of the property,
     * one 'item' (id = value of the property, value = number of occurrences)
     * per histogram entry. This is only done for the properties listed in
     * {@link #PROPERTIES}; all other string properties are left untouched.
     * Nothing is added if the persistence layer provides no histogram for
     * the property.
     *
     * @param filter
     *          the filter of the profile.
     * @param prop
     *          the property element to expand.
     * @param p
     *          the property.
     */
    private void processStringProperty(final Filter filter, final Element prop, final Property p) {
        if (!Arrays.asList(PROPERTIES).contains(p.getKey())) {
            return;
        }

        final List<String> properties = new ArrayList<String>();
        properties.add(p.getKey());
        final Map<String, Map<String, Long>> histograms = this.persistence.getHistograms(properties, filter,
                null);
        final Map<String, Long> histogram = histograms == null ? null : histograms.get(p.getKey());

        if (histogram == null) {
            // no histogram available, the property element stays without items
            return;
        }

        for (Map.Entry<String, Long> entry : histogram.entrySet()) {
            prop.addElement("item").addAttribute("id", entry.getKey())
                    .addAttribute("value", String.valueOf(entry.getValue()));
        }
    }

    /**
     * Expands the given property element with the histogram of the boolean
     * property, one 'item' (value = value of the property, count = number of
     * occurrences) per histogram entry. Nothing is added if the persistence
     * layer provides no histogram for the property.
     *
     * @param filter
     *          the filter of the profile.
     * @param prop
     *          the property element to expand.
     * @param p
     *          the property.
     */
    private void processBoolProperty(final Filter filter, final Element prop, final Property p) {

        final List<String> properties = new ArrayList<String>();
        properties.add(p.getKey());
        final Map<String, Map<String, Long>> histograms = this.persistence.getHistograms(properties, filter, null);
        final Map<String, Long> histogram = histograms == null ? null : histograms.get(p.getKey());

        if (histogram == null) {
            // no histogram available, the property element stays without items
            return;
        }

        for (Map.Entry<String, Long> entry : histogram.entrySet()) {
            prop.addElement("item").addAttribute("value", entry.getKey())
                    .addAttribute("count", String.valueOf(entry.getValue()));
        }

    }

    /**
     * Adds the numeric statistics of the property (count, sum, min, max, avg,
     * var and sd) as attributes to the given property element. The values are
     * read from the entries 'count', 'sum', 'min', 'max', 'avg', 'var' and
     * 'std' of the histogram returned by the persistence layer.
     *
     * A statistic that is missing in the histogram is not written. In
     * particular an already existing 'count' attribute is only replaced if
     * the histogram really provides a count.
     *
     * @param filter
     *          the filter of the profile.
     * @param prop
     *          the property element to expand.
     * @param p
     *          the property.
     */
    // if also a histogram is done, do not forget the bin_width...
    private void processNumericProperty(final Filter filter, final Element prop, final Property p) {

        final List<String> properties = new ArrayList<String>();
        properties.add(p.getKey());
        final Map<String, Map<String, Long>> histograms = this.persistence.getHistograms(properties, filter, null);
        final Map<String, Long> histogram = histograms == null ? null : histograms.get(p.getKey());

        if (histogram == null) {
            // no statistics available, keep the property element as it is
            return;
        }

        // { attribute name, key within the histogram }
        final String[][] statistics = { { "count", "count" }, { "sum", "sum" }, { "min", "min" },
                { "max", "max" }, { "avg", "avg" }, { "var", "var" }, { "sd", "std" } };

        for (String[] statistic : statistics) {
            final Long value = histogram.get(statistic[1]);
            if (value != null) {
                prop.addAttribute(statistic[0], value.toString());
            }
        }
    }

    /**
     * Expands the given property element with the histogram of the date
     * property, one 'item' (id = histogram key, value = number of
     * occurrences) per histogram entry. Nothing is added if the persistence
     * layer provides no histogram for the property.
     *
     * @param filter
     *          the filter of the profile.
     * @param prop
     *          the property element to expand.
     * @param p
     *          the property.
     */
    private void processDateProperty(Filter filter, Element prop, Property p) {

        final List<String> properties = new ArrayList<String>();
        properties.add(p.getKey());
        final Map<String, Map<String, Long>> histograms = this.persistence.getHistograms(properties, filter, null);
        final Map<String, Long> histogram = histograms == null ? null : histograms.get(p.getKey());

        if (histogram == null) {
            // no histogram available, the property element stays without items
            return;
        }

        for (Map.Entry<String, Long> entry : histogram.entrySet()) {
            prop.addElement("item").addAttribute("id", entry.getKey())
                    .addAttribute("value", String.valueOf(entry.getValue()));
        }
    }

}