eionet.cr.dao.helpers.CsvImportHelper.java Source code

Introduction

Here is the source code for eionet.cr.dao.helpers.CsvImportHelper.java
Source

/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Content Registry 3
 *
 * The Initial Owner of the Original Code is European Environment
 * Agency. Portions created by TripleDev or Zero Technologies are Copyright
 * (C) European Environment Agency.  All Rights Reserved.
 *
 * Contributor(s):
 *        Juhan Voolaid
 */

package eionet.cr.dao.helpers;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.openrdf.model.URI;
import org.openrdf.model.vocabulary.XMLSchema;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;

import au.com.bytecode.opencsv.CSVReader;
import eionet.cr.common.Predicates;
import eionet.cr.common.Subjects;
import eionet.cr.dao.DAOException;
import eionet.cr.dao.DAOFactory;
import eionet.cr.dao.HarvestSourceDAO;
import eionet.cr.dao.HelperDAO;
import eionet.cr.dao.PostHarvestScriptDAO;
import eionet.cr.dto.HarvestSourceDTO;
import eionet.cr.dto.ObjectDTO;
import eionet.cr.dto.PostHarvestScriptDTO;
import eionet.cr.dto.ScriptTemplateDTO;
import eionet.cr.dto.SubjectDTO;
import eionet.cr.filestore.FileStore;
import eionet.cr.filestore.ScriptTemplateDaoImpl;
import eionet.cr.util.FolderUtil;
import eionet.cr.util.Util;
import eionet.cr.util.sesame.SesameUtil;
import eionet.cr.web.action.DataLinkingScript;
import eionet.cr.web.action.UploadCSVActionBean.FileType;
import eionet.cr.web.action.admin.postHarvest.PostHarvestScriptParser;
import eionet.cr.web.util.CharsetToolkit;

/**
 * Helper methods for importing CSV file.
 *
 * @author Juhan Voolaid
 */
public class CsvImportHelper {

    /** Column name for empty name. */
    public static final String EMPTY_COLUMN = "Empty";

    /** */
    private static final Logger LOGGER = Logger.getLogger(CsvImportHelper.class);

    /** Columns detected in the uploaded file (it's the titles of the columns). */
    private List<String> columns;

    /** Column labels detected in the uploaded file (titles without type and language code). */
    private List<String> columnLabels;

    /** The columns (i.e. column titles) forming the contained objects' unique identifiers. */
    private List<String> uniqueColumns;

    /** The URI that will be assigned to the resource representing the file. */
    private String fileUri;

    /** User can specify rdf:label for the file. */
    private String fileLabel;

    /** Uploaded file's type. */
    private FileType fileType;

    /** The type of objects contained in the file (user-given free text). */
    private String objectsType;

    /** Publisher of uploaded material. */
    private String publisher;

    /** License of uploaded material. */
    private String license;

    /** Attribution. */
    private String attribution;

    /** Source of the uploaded material. */
    private String source;

    /** Incremented id for a row, in case unique columns are not specified. */
    private int idCounter;

    /** Id formatter. */
    private DecimalFormat idFormatter;

    /**
     * Class constructor.
     *
     * @param uniqueColumns
     * @param fileUri
     * @param fileLabel
     * @param fileType
     * @param objectsType
     * @param publisher
     * @param license
     * @param attribution
     * @param source
     */
    public CsvImportHelper(List<String> uniqueColumns, String fileUri, String fileLabel, FileType fileType,
            String objectsType, String publisher, String license, String attribution, String source) {

        if (uniqueColumns == null) {
            uniqueColumns = new ArrayList<String>();
        }

        this.uniqueColumns = uniqueColumns;
        this.fileUri = fileUri;
        this.fileLabel = fileLabel;
        this.fileType = fileType;
        this.objectsType = objectsType;
        this.publisher = publisher;
        this.license = license;
        this.attribution = attribution;
        this.source = source;
        idFormatter = new DecimalFormat("000000");
    }

    /**
     * Quick way to extract the csv column labels.
     */
    public static List<String> extractColumnLabels(String folderUri, String relativeFilePath, String userName,
            FileType fileType) throws Exception {

        CsvImportHelper helper = new CsvImportHelper(null, null, null, fileType, null, null, null, null, null);
        CSVReader csvReader = null;
        try {
            csvReader = helper.createCSVReader(folderUri, relativeFilePath, userName, true);
            return helper.extractColumnLabels(helper.extractColumns(csvReader));
        } catch (Exception e) {
            throw e;
        } finally {
            close(csvReader);
        }
    }

    /**
     * Closes scv reader connection.
     *
     * @param csvReader
     */
    public static void close(CSVReader csvReader) {
        if (csvReader != null) {
            try {
                csvReader.close();
            } catch (Exception e) {
                // Ignore closing exceptions.
            }
        }
    }

    /**
     * Iserts file metadata.
     *
     * @param fileSize
     * @param userName
     * @throws Exception
     */
    public void insertFileMetadataAndSource(long fileSize, String userName) throws Exception {

        HarvestSourceDAO dao = DAOFactory.get().getDao(HarvestSourceDAO.class);
        dao.addSourceIgnoreDuplicate(HarvestSourceDTO.create(fileUri, false, 0, userName));

        String mediaType = fileType.toString();
        String lastModified = Util.virtuosoDateToString(new Date());

        dao.insertUpdateSourceMetadata(fileUri, Predicates.RDF_TYPE,
                ObjectDTO.createResource(Subjects.CR_TABLE_FILE));
        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_BYTE_SIZE, ObjectDTO.createLiteral(fileSize));
        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_MEDIA_TYPE, ObjectDTO.createLiteral(mediaType));
        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_LAST_MODIFIED, ObjectDTO.createLiteral(lastModified));
    }

    /**
     * Adds reference of the file to the given parent folder.
     *
     * @param folderUri
     * @param userName
     * @throws DAOException
     */
    public void linkFileToFolder(String folderUri, String userName) throws DAOException {

        // prepare "folder hasFile file" statement
        ObjectDTO fileObject = ObjectDTO.createResource(fileUri);
        // fileObject.setSourceUri(folderUri);
        String folderContext = FolderUtil.folderContext(folderUri);
        fileObject.setSourceUri(folderContext);
        SubjectDTO folderSubject = new SubjectDTO(folderUri, false);
        folderSubject.addObject(Predicates.CR_HAS_FILE, fileObject);

        // persist the prepared "folder hasFile file" statement
        DAOFactory.get().getDao(HelperDAO.class).addTriples(folderSubject);

        // since folder URI was used above as triple source, add it to HARVEST_SOURCE too
        // (but set interval minutes to 0, to avoid it being background-harvested)
        // HarvestSourceDTO folderHarvestSource = HarvestSourceDTO.create(folderUri, false, 0, getUserName());
        HarvestSourceDTO folderHarvestSource = HarvestSourceDTO.create(folderContext, false, 0, userName);
        DAOFactory.get().getDao(HarvestSourceDAO.class).addSourceIgnoreDuplicate(folderHarvestSource);
    }

    /**
     * @param guessEncoding
     * @return
     * @throws IOException
     */
    public CSVReader createCSVReader(String folderUri, String relativeFilePath, String userName,
            boolean guessEncoding) throws IOException {

        CSVReader result = null;
        // File file = FileStore.getInstance(getUserName()).getFile(relativeFilePath);
        File file = FileStore.getInstance(FolderUtil.getUserDir(folderUri, userName)).getFile(relativeFilePath);
        if (file != null && file.exists()) {
            if (guessEncoding) {
                Charset charset = CharsetToolkit.guessEncoding(file, 4096, Charset.forName("UTF-8"));
                result = new CSVReader(new InputStreamReader(new FileInputStream(file), charset), getDelimiter());
            } else {
                result = new CSVReader(new FileReader(file), getDelimiter());
            }
        }

        return result;
    }

    /**
     * Extracts data from csv file.
     *
     * @param csvReader
     * @throws IOException
     * @throws DAOException
     * @throws RepositoryException
     */
    public void extractObjects(CSVReader csvReader) throws IOException, DAOException, RepositoryException {

        // Set columns and columnLabels by reading the first line.
        columns = extractColumns(csvReader);
        columnLabels = extractColumnLabels(columns);

        // Read the contained objects by reading the rest of lines.
        String[] line = null;
        String objectsTypeUri = fileUri + "/" + objectsType;
        HelperDAO helperDao = DAOFactory.get().getDao(HelperDAO.class);

        RepositoryConnection conn = SesameUtil.getRepositoryConnection();
        conn.setAutoCommit(false);
        try {
            while ((line = csvReader.readNext()) != null) {
                SubjectDTO subject = extractObject(line, objectsTypeUri);
                helperDao.addTriples(conn, subject);
            }
            conn.commit();
        } catch (DAOException e) {
            SesameUtil.rollback(conn);
            throw e;
        } catch (RepositoryException e) {
            SesameUtil.rollback(conn);
            throw e;
        } finally {
            SesameUtil.close(conn);
        }

        // Construct a SPARQL query and store it as a property
        StringBuilder query = new StringBuilder();
        query.append("PREFIX tableFile: <" + fileUri + "#>\n\n");
        query.append("SELECT *\nFROM <").append(fileUri).append(">\nWHERE {\n");
        for (String column : columnLabels) {
            column = column.replace(" ", "_");
            String columnUri = "tableFile:" + column;
            query.append(" OPTIONAL { _:rec ").append(columnUri).append(" ?").append(column).append(" } .\n");
        }
        query.append("}");

        HarvestSourceDAO dao = DAOFactory.get().getDao(HarvestSourceDAO.class);
        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_SPARQL_QUERY,
                ObjectDTO.createLiteral(query.toString()));

        // Finally, make sure that the file has the correct number of harvested statements in its predicates.
        DAOFactory.get().getDao(HarvestSourceDAO.class).updateHarvestedStatementsTriple(fileUri);
    }

    /**
     * Stores the additional meta data from wizard inputs.
     *
     * @throws DAOException
     * @throws RepositoryException
     * @throws IOException
     */
    public void saveWizardInputs() throws DAOException, RepositoryException, IOException {

        HarvestSourceDAO dao = DAOFactory.get().getDao(HarvestSourceDAO.class);

        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_OBJECTS_TYPE, ObjectDTO.createLiteral(objectsType));
        if (StringUtils.isNotEmpty(fileLabel)) {
            dao.insertUpdateSourceMetadata(fileUri, Predicates.RDFS_LABEL, ObjectDTO.createLiteral(fileLabel));
        }

        ObjectDTO[] uniqueColTitles = new ObjectDTO[uniqueColumns.size()];
        for (int i = 0; i < uniqueColumns.size(); i++) {
            uniqueColTitles[i] = ObjectDTO.createLiteral(uniqueColumns.get(i));
        }

        dao.insertUpdateSourceMetadata(fileUri, Predicates.CR_OBJECTS_UNIQUE_COLUMN, uniqueColTitles);

        // Copyright information
        if (StringUtils.isNotEmpty(publisher)) {
            if (StringUtils.startsWithIgnoreCase(publisher, "http")) {
                dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_PUBLISHER,
                        ObjectDTO.createResource(publisher));
            } else {
                dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_PUBLISHER,
                        ObjectDTO.createLiteral(publisher));
            }
        }
        if (StringUtils.startsWithIgnoreCase(license, "http")) {
            dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_LICENSE, ObjectDTO.createResource(license));
        } else {
            dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_RIGHTS, ObjectDTO.createLiteral(license));
        }
        if (StringUtils.isNotEmpty(attribution)) {
            dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_BIBLIOGRAPHIC_CITATION,
                    ObjectDTO.createLiteral(attribution));
        }
        if (StringUtils.isNotEmpty(source)) {
            if (StringUtils.startsWithIgnoreCase(source, "http")) {
                dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_SOURCE,
                        ObjectDTO.createResource(source));
            } else {
                dao.insertUpdateSourceMetadata(fileUri, Predicates.DCTERMS_SOURCE, ObjectDTO.createLiteral(source));
            }
        }
    }

    /**
     * Saves the selected data linking script information and stores it as source specific post harvest script.
     *
     * @throws DAOException
     */
    public void saveDataLinkingScripts(List<DataLinkingScript> dataLinkingScripts) throws DAOException {
        PostHarvestScriptDAO dao = DAOFactory.get().getDao(PostHarvestScriptDAO.class);
        List<PostHarvestScriptDTO> scripts = dao.list(PostHarvestScriptDTO.TargetType.SOURCE, fileUri);

        for (DataLinkingScript dataLinkingScript : dataLinkingScripts) {

            String columnUri = fileUri + "#" + dataLinkingScript.getColumn();
            columnUri = "<" + columnUri.replace(" ", "_") + ">";

            ScriptTemplateDTO scriptTemplate = new ScriptTemplateDaoImpl()
                    .getScriptTemplate(dataLinkingScript.getScriptId());
            String script = StringUtils.replace(scriptTemplate.getScript(), "[TABLECOLUMN]", columnUri);

            int existingScriptId = isUniqueScript(scripts, fileUri, scriptTemplate.getName());
            if (existingScriptId == 0) {
                dao.insert(PostHarvestScriptDTO.TargetType.SOURCE, fileUri, scriptTemplate.getName(), script, true,
                        true);
            } else {
                dao.save(existingScriptId, scriptTemplate.getName(), script, true, true);
            }
        }
    }

    /**
     * Runs all the source specific scripts that are stored for the file uri.
     *
     * @return warning messages
     * @throws Exception
     */
    public List<String> runScripts() throws Exception {
        RepositoryConnection conn = null;
        List<String> warnings = new ArrayList<String>();
        try {
            conn = SesameUtil.getRepositoryConnection();
            conn.setAutoCommit(false);
            PostHarvestScriptDAO dao = DAOFactory.get().getDao(PostHarvestScriptDAO.class);

            List<PostHarvestScriptDTO> scripts = dao.listActive(PostHarvestScriptDTO.TargetType.SOURCE, fileUri);

            for (PostHarvestScriptDTO script : scripts) {
                String warning = runScript(script, conn);
                if (StringUtils.isNotEmpty(warning)) {
                    warnings.add(warning);
                }
            }

            conn.commit();
        } catch (Exception e) {
            SesameUtil.rollback(conn);
            throw e;
        } finally {
            SesameUtil.close(conn);
        }

        return warnings;
    }

    /**
     * Runs the script.
     *
     * @param scriptDto
     * @param conn
     * @return warning message
     */
    private String runScript(PostHarvestScriptDTO scriptDto, RepositoryConnection conn) {

        String targetUrl = scriptDto.getTargetUrl();
        String query = scriptDto.getScript();
        String title = scriptDto.getTitle();
        String parsedQuery = PostHarvestScriptParser.parseForExecution(query, targetUrl, null);

        String warningMessage = null;

        try {
            int updateCount = SesameUtil.executeSPARUL(parsedQuery, conn);
            if (updateCount > 0 && !scriptDto.isRunOnce()) {
                // run maximum 100 times
                LOGGER.debug("Script's update count was " + updateCount
                        + ", running it until the count becomes 0, or no more than 100 times ...");
                int i = 0;
                int totalUpdateCount = updateCount;
                for (; updateCount > 0 && i < 100; i++) {
                    updateCount = SesameUtil.executeSPARUL(parsedQuery, conn, targetUrl);
                    totalUpdateCount += updateCount;
                }
                LOGGER.debug("Script was run for a total of " + (i + 1) + " times, total update count = "
                        + totalUpdateCount);
            } else {
                LOGGER.debug("Script's update count was " + updateCount);
            }
        } catch (Exception e) {
            LOGGER.error("Failed to run data linking post-harvest script '" + title + "': " + e.getMessage(), e);
            warningMessage = "Failed to run data linking post-harvest script '" + title + "': " + e.getMessage();
        }

        return warningMessage;
    }

    /**
     * Checks if script with given uri and name already exists in database. If so, the id of the script is returned.
     *
     * @param scripts
     * @param uri
     * @param name
     * @return
     */
    private int isUniqueScript(List<PostHarvestScriptDTO> scripts, String uri, String name) {
        for (PostHarvestScriptDTO script : scripts) {
            if (uri.equalsIgnoreCase(script.getTargetUrl()) && name.equalsIgnoreCase(script.getTitle())) {
                return script.getId();
            }
        }
        return 0;
    }

    /**
     * Extracts columns (with language and type) from csv file.
     *
     * @param csvReader
     * @return
     * @throws IOException
     */
    private List<String> extractColumns(CSVReader csvReader) throws IOException {
        // Set columns and columnLabels by reading the first line.
        String[] columnsArray = csvReader.readNext();
        ArrayList<String> columnsResult = new ArrayList<String>();
        if (columnsArray != null && columnsArray.length > 0) {
            int emptyColCount = 1;
            for (String col : columnsArray) {
                if (StringUtils.isEmpty(col)) {
                    col = CsvImportHelper.EMPTY_COLUMN + emptyColCount++;
                }
                columnsResult.add(col.trim());
            }
        }

        return columnsResult;
    }

    /**
     * Extracts column labels (without language and type) from columns.
     *
     * @param rawColumns
     * @return
     */
    private List<String> extractColumnLabels(List<String> rawColumns) {
        ArrayList<String> columnsLabelResult = new ArrayList<String>();

        for (String col : rawColumns) {
            String colLabel = StringUtils.substringBefore(col, ":").trim();
            colLabel = StringUtils.substringBefore(colLabel, "@").trim();
            columnsLabelResult.add(colLabel);
        }

        return columnsLabelResult;
    }

    /**
     * Extracts object from csv row.
     *
     * @param line
     * @param objectsTypeUri
     * @return
     */
    private SubjectDTO extractObject(String[] line, String objectsTypeUri) {

        // Construct subject URI and DTO object.
        String subjectUri = fileUri + "/" + extractObjectId(line);
        SubjectDTO subject = new SubjectDTO(subjectUri, false);

        // Add rdf:type to DTO.
        ObjectDTO typeObject = new ObjectDTO(objectsTypeUri, false);
        typeObject.setSourceUri(fileUri);
        subject.addObject(Predicates.RDF_TYPE, typeObject);

        // Add all other values.
        for (int i = 0; i < columns.size(); i++) {

            // If current columns index out of bounds for some reason, then break.
            if (i >= line.length) {
                break;
            }

            // Get column title, skip this column if it's the label column, otherwise replace spaces.
            String column = columns.get(i);

            // Extract column type and language code
            String type = StringUtils.substringAfter(column, ":");
            if (type != null && type.length() == 0) {
                type = null;
            }
            String lang = StringUtils.substringAfter(column, "@");
            if (lang != null && lang.length() == 0) {
                lang = null;
            }

            // Get column label
            column = columnLabels.get(i);
            column = column.replace(" ", "_");

            // Create ObjectDTO representing the given column's value on this line
            ObjectDTO objectDTO = createValueObject(column, line[i], type, lang);
            objectDTO.setSourceUri(fileUri);

            // Add ObjectDTO to the subject.
            String predicateUri = fileUri + "#" + column;
            subject.addObject(predicateUri, objectDTO);

        }

        return subject;
    }

    /**
     * Retrurns unique object id.
     *
     * @param line
     * @return
     */
    private String extractObjectId(String[] line) {

        StringBuilder buf = new StringBuilder();
        String result = null;
        if (uniqueColumns != null && !uniqueColumns.isEmpty()) {

            for (String uniqueCol : uniqueColumns) {
                int colIndex = columnLabels.indexOf(uniqueCol);
                if (colIndex >= 0 && colIndex < line.length && !StringUtils.isBlank(line[colIndex])) {
                    if (buf.length() > 0) {
                        buf.append("_");
                    }
                    buf.append(line[colIndex]);
                }
            }
            result = buf.toString();
        } else {
            result = idFormatter.format(++idCounter);
        }

        return result;
    }

    /**
     * Returns rdf object value with additional type and language definitions.
     *
     * @param column
     * @param value
     * @param type
     * @param lang
     * @return
     */
    private ObjectDTO createValueObject(String column, String value, String type, String lang) {

        HashMap<String, URI> types = new HashMap<String, URI>();
        types.put("url", null);
        types.put("uri", null);
        types.put("date", XMLSchema.DATE);
        types.put("datetime", XMLSchema.DATETIME);
        types.put("boolean", XMLSchema.BOOLEAN);
        types.put("integer", XMLSchema.INTEGER);
        types.put("int", XMLSchema.INT);
        types.put("long", XMLSchema.LONG);
        types.put("double", XMLSchema.DOUBLE);
        types.put("decimal", XMLSchema.DECIMAL);
        types.put("float", XMLSchema.FLOAT);

        // If type is not defined, but column name matches one of the types, then use column name as datatype
        if (type == null) {
            if (types.keySet().contains(column.toLowerCase())) {
                type = column.toLowerCase();
            }
        }

        ObjectDTO objectDTO = null;
        if (!StringUtils.isBlank(type)) {
            if (type.equalsIgnoreCase("url") || type.equalsIgnoreCase("uri")) {
                objectDTO = new ObjectDTO(value, lang, false, false, null);
            } else if (types.keySet().contains(type.toLowerCase())) {
                if (type.equalsIgnoreCase("boolean")) {
                    value = value.equalsIgnoreCase("true") ? "true" : "false";
                }
                URI datatype = types.get(type.toLowerCase());
                objectDTO = new ObjectDTO(value, lang, true, false, datatype);
            } else if (type.equalsIgnoreCase("number")) {
                try {
                    Integer.parseInt(value);
                    objectDTO = new ObjectDTO(value, lang, true, false, XMLSchema.INTEGER);
                } catch (NumberFormatException nfe1) {
                    try {
                        Long.parseLong(value);
                        objectDTO = new ObjectDTO(value, lang, true, false, XMLSchema.LONG);
                    } catch (NumberFormatException nfe2) {
                        try {
                            Double.parseDouble(value);
                            objectDTO = new ObjectDTO(value, lang, true, false, XMLSchema.DOUBLE);
                        } catch (NumberFormatException nfe3) {
                            // No need to throw or log it.
                        }
                    }
                }
            }
        }

        return objectDTO == null ? new ObjectDTO(value, lang, true, false, null) : objectDTO;
    }

    /**
     * Returns deliminiter based of the file type.
     *
     * @return
     */
    private char getDelimiter() {
        return fileType != null && fileType.equals(FileType.TSV) ? '\t' : ',';
    }

}