org.matonto.etl.service.delimited.DelimitedConverterImpl.java Source code

Introduction

Here is the source code for org.matonto.etl.service.delimited.DelimitedConverterImpl.java
Source

package org.matonto.etl.service.delimited;

/*-
 * #%L
 * org.matonto.etl.csv
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2016 iNovex Information Systems, Inc.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */

import aQute.bnd.annotation.component.Component;
import aQute.bnd.annotation.component.Reference;
import com.opencsv.CSVReader;
import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.matonto.etl.api.config.ExcelConfig;
import org.matonto.etl.api.config.SVConfig;
import org.matonto.etl.api.delimited.DelimitedConverter;
import org.matonto.etl.api.exception.MatOntoETLException;
import org.matonto.etl.api.ontologies.delimited.ClassMapping;
import org.matonto.etl.api.ontologies.delimited.ClassMappingFactory;
import org.matonto.etl.api.ontologies.delimited.Property;
import org.matonto.exception.MatOntoException;
import org.matonto.rdf.api.IRI;
import org.matonto.rdf.api.Model;
import org.matonto.rdf.api.ModelFactory;
import org.matonto.rdf.api.Resource;
import org.matonto.rdf.api.ValueFactory;
import org.matonto.rdf.orm.Thing;
import org.matonto.rest.util.CharsetUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Component(provide = DelimitedConverter.class)
public class DelimitedConverterImpl implements DelimitedConverter {
    private static final Logger LOGGER = Logger.getLogger(DelimitedConverterImpl.class);
    private static final String LOCAL_NAME_PATTERN = "\\$\\{(\\d+|UUID)\\}";
    private static final String DEFAULT_PREFIX = "http://matonto.org/data/";

    private ValueFactory valueFactory;
    private ModelFactory modelFactory;
    private ClassMappingFactory classMappingFactory;

    @Reference
    public void setValueFactory(ValueFactory valueFactory) {
        this.valueFactory = valueFactory;
    }

    @Reference
    public void setModelFactory(ModelFactory modelFactory) {
        this.modelFactory = modelFactory;
    }

    @Reference
    public void setClassMappingFactory(ClassMappingFactory classMappingFactory) {
        this.classMappingFactory = classMappingFactory;
    }

    @Override
    public Model convert(SVConfig config) throws IOException, MatOntoException {
        byte[] data = toByteArrayOutputStream(config.getData()).toByteArray();
        Charset charset = CharsetUtils.getEncoding(new ByteArrayInputStream(data))
                .orElseThrow(() -> new MatOntoException("Unsupported character set"));
        CSVReader reader = new CSVReader(new InputStreamReader(new ByteArrayInputStream(data), charset),
                config.getSeparator());
        Model convertedRDF = modelFactory.createModel();
        ArrayList<ClassMapping> classMappings = parseClassMappings(config.getMapping());
        long offset = config.getOffset();
        boolean containsHeaders = config.getContainsHeaders();

        // If headers exist, skip them
        if (containsHeaders) {
            reader.readNext();
        }

        // Skip to offset point
        while (reader.getLinesRead() - (containsHeaders ? 1 : 0) < offset) {
            System.out.println(reader.getLinesRead() - (containsHeaders ? 1 : 0));
            reader.readNext();
        }

        //Traverse each row and convert column into RDF
        String[] nextLine;
        long index = config.getOffset();
        Optional<Long> limit = config.getLimit();
        while ((nextLine = reader.readNext()) != null && (!limit.isPresent() || index < limit.get() + offset)) {
            writeClassMappingsToModel(convertedRDF, nextLine, classMappings);
            index++;
        }
        return convertedRDF;
    }

    @Override
    public Model convert(ExcelConfig config) throws IOException, MatOntoException {
        String[] nextRow;
        Model convertedRDF = modelFactory.createModel();
        ArrayList<ClassMapping> classMappings = parseClassMappings(config.getMapping());

        try {
            Workbook wb = WorkbookFactory.create(config.getData());
            Sheet sheet = wb.getSheetAt(0);
            DataFormatter df = new DataFormatter();
            boolean containsHeaders = config.getContainsHeaders();
            long offset = config.getOffset();
            Optional<Long> limit = config.getLimit();

            //Traverse each row and convert column into RDF
            for (Row row : sheet) {
                // If headers exist or the row is before the offset point, skip the row
                if ((containsHeaders && row.getRowNum() == 0)
                        || row.getRowNum() - (containsHeaders ? 1 : 0) < offset
                        || (limit.isPresent() && row.getRowNum() >= limit.get() + offset)) {
                    continue;
                }
                nextRow = new String[row.getPhysicalNumberOfCells()];
                int cellIndex = 0;
                for (Cell cell : row) {
                    nextRow[cellIndex] = df.formatCellValue(cell);
                    cellIndex++;
                }
                writeClassMappingsToModel(convertedRDF, nextRow, classMappings);
            }
        } catch (InvalidFormatException e) {
            throw new MatOntoException(e);
        }

        return convertedRDF;
    }

    /**
     * Processes a row of data into RDF using class mappings and adds it to the given Model.
     *
     * @param convertedRDF the model to hold the converted data
     * @param line the data to convert
     * @param classMappings the classMappings to use when converting the data
     */
    private void writeClassMappingsToModel(Model convertedRDF, String[] line, List<ClassMapping> classMappings) {
        // Map holds ClassMappings to instance IRIs. Modified by writeClassToModel().
        Map<Resource, IRI> mappedClasses = new HashMap<>();
        for (ClassMapping cm : classMappings) {
            convertedRDF.addAll(writeClassToModel(cm, line, mappedClasses));
        }
    }

    /**
     * Generates a UUID for use in new RDF instances. Separate method allows for testing.
     *
     * @return A String with a Universally Unique Identifier
     */
    public String generateUuid() {
        return UUID.randomUUID().toString();
    }

    /**
     * Creates a Model of RDF statements based on a class mapping and a line of data from CSV.
     *
     * @param cm       The ClassMapping object to guide the RDF creation
     * @param nextLine The line of CSV to be mapped
     * @param mappedClasses The Map holding previously processed ClassMappings and their associated instance IRIs.
     *                      Modified by this method.
     * @return A Model of RDF based on the line of CSV data
     */
    private Model writeClassToModel(ClassMapping cm, String[] nextLine, Map<Resource, IRI> mappedClasses) {
        Model convertedRDF = modelFactory.createModel();

        Optional<String> nameOptional = generateLocalName(cm, nextLine);
        if (!nameOptional.isPresent()) {
            return convertedRDF;
        }

        IRI classInstance;
        Iterator<String> prefixes = cm.getHasPrefix().iterator();
        if (prefixes.hasNext()) {
            classInstance = valueFactory.createIRI(prefixes.next() + nameOptional.get());
        } else {
            classInstance = valueFactory.createIRI(DEFAULT_PREFIX + nameOptional.get());
        }

        Resource mapsToResource;
        Iterator<Thing> mapsTo = cm.getMapsTo().iterator();
        if (mapsTo.hasNext()) {
            mapsToResource = mapsTo.next().getResource();
        } else {
            throw new MatOntoETLException(
                    "Invalid mapping configuration. Missing mapsTo property on " + cm.getResource());
        }

        convertedRDF.add(classInstance, valueFactory.createIRI(org.matonto.ontologies.rdfs.Resource.type_IRI),
                mapsToResource);
        mappedClasses.put(cm.getResource(), classInstance);

        cm.getDataProperty().forEach(dataMapping -> {
            int columnIndex = dataMapping.getColumnIndex().iterator().next();
            Property prop = dataMapping.getHasProperty().iterator().next();

            if (columnIndex < nextLine.length && columnIndex >= 0) {
                convertedRDF.add(classInstance, valueFactory.createIRI(prop.getResource().stringValue()),
                        valueFactory.createLiteral(nextLine[columnIndex]));
            } else {
                LOGGER.warn(String.format("Column %d missing for %s: %s", columnIndex, classInstance.stringValue(),
                        prop.getResource().stringValue()));
            }
        });

        cm.getObjectProperty().forEach(objectMapping -> {
            ClassMapping targetClassMapping;
            Iterator<ClassMapping> classMappingIterator = objectMapping.getClassMapping().iterator();
            if (classMappingIterator.hasNext()) {
                targetClassMapping = classMappingIterator.next();
            } else {
                throw new MatOntoETLException("Invalid mapping configuration. Missing classMapping property on "
                        + objectMapping.getResource());
            }

            Property prop = objectMapping.getHasProperty().iterator().next();

            IRI targetIri;
            if (mappedClasses.containsKey(targetClassMapping.getResource())) {
                targetIri = mappedClasses.get(targetClassMapping.getResource());
            } else {
                Optional<String> targetNameOptional = generateLocalName(targetClassMapping, nextLine);
                if (!targetNameOptional.isPresent()) {
                    return;
                } else {
                    targetIri = valueFactory.createIRI(
                            targetClassMapping.getHasPrefix().iterator().next() + targetNameOptional.get());
                    mappedClasses.put(targetClassMapping.getResource(), targetIri);
                }
            }

            convertedRDF.add(classInstance, valueFactory.createIRI(prop.getResource().stringValue()), targetIri);
        });

        return convertedRDF;
    }

    /**
     * Generates a local name for RDF Instances. If no local name is configured in the ClassMapping, a random UUID
     * is generated.
     *
     * @param cm That ClassMapping from which to retrieve the local name template if it exists
     * @param currentLine The current line in the CSV file in case data is used in the Local Name
     * @return The local name portion of a IRI used in RDF data
     */
    Optional<String> generateLocalName(ClassMapping cm, String[] currentLine) {
        Optional<String> nameOptional = cm.getLocalName();

        if (!nameOptional.isPresent() || nameOptional.get().equals("")) {
            //Only generate UUIDs when necessary. If you really have to waste a UUID go here: http://wasteaguid.info/
            return Optional.of(generateUuid());
        }

        Pattern pat = Pattern.compile(LOCAL_NAME_PATTERN);
        Matcher mat = pat.matcher(nameOptional.get());
        StringBuffer result = new StringBuffer();
        while (mat.find()) {
            if ("UUID".equals(mat.group(1))) {
                //Once again, only generate UUIDs when necessary
                mat.appendReplacement(result, generateUuid());
            } else {
                int colIndex = Integer.parseInt(mat.group(1));
                if (colIndex < currentLine.length && colIndex >= 0) {
                    mat.appendReplacement(result, currentLine[colIndex]);
                } else {
                    LOGGER.warn(String.format("Missing data for local name from column %d", colIndex));
                    return Optional.empty();
                }
            }
        }
        mat.appendTail(result);
        return Optional.of(result.toString());
    }

    /**
     * Parse the data from the Mapping File into ClassMapping POJOs
     *
     * @param mappingModel The Mapping File used to parse CSV data in a Model
     * @return An ArrayList of ClassMapping Objects created from the mapping model.
     */
    private ArrayList<ClassMapping> parseClassMappings(Model mappingModel) {
        ArrayList<ClassMapping> classMappings = new ArrayList<>();

        Model classMappingModel = mappingModel.filter(null,
                valueFactory.createIRI(org.matonto.ontologies.rdfs.Resource.type_IRI),
                valueFactory.createIRI(ClassMapping.TYPE));

        for (Resource classMappingResource : classMappingModel.subjects()) {
            ClassMapping classMapping = classMappingFactory.getExisting(classMappingResource, mappingModel);
            classMappings.add(classMapping);
        }

        return classMappings;
    }

    /**
     * Creates a ByteArrayOutputStream from an InputStream so it can be reused.
     *
     * @param in the InputStream to convert
     * @return a ByteArrayOutputStream with the contents of the InputStream
     * @throws IOException if a error occurs when accessing the InputStream contents
     */
    private ByteArrayOutputStream toByteArrayOutputStream(InputStream in) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int read;
        while ((read = in.read(buffer, 0, buffer.length)) != -1) {
            baos.write(buffer, 0, read);
            baos.flush();
        }
        return baos;
    }
}