io.seqware.pipeline.plugins.FileProvenanceQueryTool.java Source code

Java tutorial

Introduction

Here is the source code for io.seqware.pipeline.plugins.FileProvenanceQueryTool.java

Source

/*
 * Copyright (C) 2013 SeqWare
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package io.seqware.pipeline.plugins;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import io.seqware.common.model.ProcessingStatus;
import io.seqware.common.model.WorkflowRunStatus;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import joptsimple.ArgumentAcceptingOptionSpec;
import joptsimple.OptionSpecBuilder;
import net.sourceforge.seqware.common.model.FileProvenanceParam;
import net.sourceforge.seqware.common.module.ReturnValue;
import net.sourceforge.seqware.common.util.Log;
import net.sourceforge.seqware.pipeline.plugin.Plugin;
import net.sourceforge.seqware.pipeline.plugin.PluginInterface;
import net.sourceforge.seqware.pipeline.plugins.fileprovenance.ProvenanceUtility;
import org.apache.commons.dbutils.DbUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.MapListHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.openide.util.lookup.ServiceProvider;

/**
 * Pulls back a file provenance report, runs an arbitrarily complex SQL query on the results, and saves the results as a tab separated
 * file for use as a part of interpreted language deciders.
 * 
 * @author dyuen
 * @version 1.1.0
 */
@ServiceProvider(service = PluginInterface.class)
public class FileProvenanceQueryTool extends Plugin {

    /** Name of the in-database table that user queries must reference. */
    private static final String TABLE_NAME = "FILE_REPORT";
    private final ArgumentAcceptingOptionSpec<String> outFileSpec;
    private final ArgumentAcceptingOptionSpec<String> querySpec;
    private final ArgumentAcceptingOptionSpec<String> inFileSpec;
    private final OptionSpecBuilder useH2InMemorySpec;

    /**
     * Registers the command-line options for this plugin: the standard file
     * provenance filters plus --in, --out, --query and --H2mem.
     */
    public FileProvenanceQueryTool() {
        ProvenanceUtility.configureFileProvenanceParams(parser);
        this.inFileSpec = parser.accepts("in",
                "The tab separated file that will be used instead of pulling back a "
                        + "fresh file provenance report. "
                        + "Must be a tab separated file with a fixed number of columns with "
                        + "a provided header (that will be used for column names). ")
                .withRequiredArg();
        this.outFileSpec = parser.accepts("out", "The tab separated file into which the results will be written.")
                .withRequiredArg().required();
        this.useH2InMemorySpec = parser.accepts("H2mem",
                "Use H2 in-memory database for better performance (but with memory limits)");
        this.querySpec = parser
                .accepts("query",
                        "The standard SQL query that should be run. Table queried should be " + TABLE_NAME)
                .withRequiredArg().required();
    }

    @Override
    public String get_description() {
        return "Pulls back a file provenance report (or a previous tab-separated file), runs an arbitrarily complex SQL query on the results and saves the results as a tab separated file for use as a part of interpreted language deciders.";
    }

    /**
     * Validates options: when no --in file is given, at least one provenance
     * filter (or '--all') must be present.
     *
     * @return INVALIDPARAMETERS on bad options, success otherwise
     */
    @Override
    public ReturnValue init() {
        if (!options.has(inFileSpec) && !ProvenanceUtility.checkForValidOptions(options)) {
            // fixed typo: "contraints" -> "constraints"
            println("One of the various constraints or '--all' must be specified.");
            println(this.get_syntax());
            return new ReturnValue(ReturnValue.INVALIDPARAMETERS);
        }

        return new ReturnValue();
    }

    @Override
    public ReturnValue do_test() {
        return new ReturnValue();
    }

    /**
     * Main work: obtain a provenance report (from --in or the web service),
     * load it into an embedded H2 database as table FILE_REPORT, run the
     * user's --query, and write the result set to --out as TSV.
     *
     * @return success ReturnValue; wraps checked failures in RuntimeException
     */
    @Override
    public ReturnValue do_run() {
        Path randomTempDirectory = null;
        Path originalReport = null;
        Path bulkImportFile = null;
        // Track whether we created the report ourselves. Previously the finally
        // block deleted originalReport unconditionally, which destroyed a
        // user-supplied --in file.
        boolean reportIsTemporary = false;
        Connection connection = null;
        try {
            if (options.has(this.inFileSpec)) {
                originalReport = FileSystems.getDefault().getPath(options.valueOf(inFileSpec));
            } else {
                originalReport = populateOriginalReportFromWS();
                reportIsTemporary = true;
            }

            // derive column names/datatypes from the header and convert the
            // report into an H2 CSVREAD-compatible bulk import file
            List<String> headers = Lists.newArrayList();
            List<Boolean> numericDataType = Lists.newArrayList();
            bulkImportFile = Files.createTempFile("import", "txt");
            convertReportToBulkImportFile(originalReport, bulkImportFile, headers, numericDataType);

            randomTempDirectory = Files.createTempDirectory("randomFileProvenanceQueryDir");

            // try using in-memory for better performance
            String protocol = "jdbc:h2:";
            if (options.has(useH2InMemorySpec)) {
                protocol = protocol + "mem:";
            }
            connection = spinUpEmbeddedDB(randomTempDirectory, "org.h2.Driver", protocol);

            dropReportTableIfPresent(connection);
            bulkImportH2(buildTableCreateQuery(headers, numericDataType), connection, bulkImportFile);
            runQueryAndWriteOutput(connection);

            Log.stdoutWithTime("Wrote output to " + options.valueOf(outFileSpec));
            return new ReturnValue();
        } catch (IOException | SQLException | ClassNotFoundException ex) {
            throw new RuntimeException(ex);
        } finally {
            // close the connection even when the import or query failed; the
            // original code leaked it on every exception path
            DbUtils.closeQuietly(connection);
            if (reportIsTemporary && originalReport != null) {
                FileUtils.deleteQuietly(originalReport.toFile());
            }
            if (bulkImportFile != null) {
                FileUtils.deleteQuietly(bulkImportFile.toFile());
            }
            if (randomTempDirectory != null && randomTempDirectory.toFile().exists()) {
                FileUtils.deleteQuietly(randomTempDirectory.toFile());
            }
        }
    }

    /**
     * Reads the tab-separated report, fills {@code headers} (lower-cased,
     * underscored column names) and {@code numericDataType} (true for SWID
     * columns), and writes a quoted bulk import file for H2's CSVREAD.
     *
     * @throws IOException on read/write failure or an empty report
     */
    private void convertReportToBulkImportFile(Path originalReport, Path bulkImportFile, List<String> headers,
            List<Boolean> numericDataType) throws IOException {
        try (BufferedReader originalReader = Files.newBufferedReader(originalReport, Charset.defaultCharset());
                BufferedWriter importWriter = Files.newBufferedWriter(bulkImportFile, Charset.defaultCharset())) {
            String headerLine = originalReader.readLine();
            if (headerLine == null) {
                throw new IOException("Provenance report " + originalReport + " is empty, no header line found");
            }
            for (String column : headerLine.split("\t")) {
                String editedColumnName = StringUtils.lowerCase(column).replaceAll(" ", "_").replaceAll("-", "_");
                headers.add(editedColumnName);
                // note that Parent Sample SWID is a silly column that has colons in it
                numericDataType.add(!editedColumnName.contains("parent_sample") && editedColumnName.contains("swid"));
            }
            Log.debug("Bulk import file written to " + bulkImportFile.toString());
            // readLine() != null is the reliable EOF test; the original used
            // ready(), which is not guaranteed to be true until end of stream
            String line;
            while ((line = originalReader.readLine()) != null) {
                importWriter.write(convertLine(line, numericDataType));
                importWriter.newLine();
            }
        }
    }

    /**
     * Converts one report line into bulk-import form: numeric columns pass
     * through (blank becomes an empty field / SQL NULL), string columns are
     * wrapped in double quotes with embedded quotes doubled.
     */
    private String convertLine(String line, List<Boolean> numericDataType) {
        StringBuilder builder = new StringBuilder();
        int i = 0;
        for (String colValue : line.split("\t")) {
            if (i != 0) {
                builder.append("\t");
            }
            if (numericDataType.get(i)) {
                if (!colValue.trim().isEmpty()) {
                    builder.append(colValue);
                }
            } else {
                // assume that this is a string
                // need to double quotes to preserve them, see
                // https://db.apache.org/derby/docs/10.4/tools/ctoolsimportdefaultformat.html
                builder.append("\"").append(colValue.replaceAll("\"", "\"\"")).append("\"");
            }
            i++;
        }
        return builder.toString();
    }

    /**
     * Drops a pre-existing report table (e.g. when re-running inside an IDE);
     * absence of the table is the normal case and is logged at debug level.
     */
    private void dropReportTableIfPresent(Connection connection) {
        try (Statement dropTableStatement = connection.createStatement()) {
            dropTableStatement.executeUpdate("DROP TABLE " + TABLE_NAME);
        } catch (SQLException e) {
            Log.debug("Report table didn't exist (normal)");
        }
    }

    /**
     * Builds the CREATE TABLE prefix for the report table: SWID columns are
     * INT, everything else VARCHAR. The CSVREAD suffix is appended later by
     * {@link #bulkImportH2}.
     */
    private StringBuilder buildTableCreateQuery(List<String> headers, List<Boolean> numericDataType) {
        StringBuilder tableCreateBuilder = new StringBuilder();
        tableCreateBuilder.append("CREATE TABLE " + TABLE_NAME + " (");
        for (int i = 0; i < headers.size(); i++) {
            if (i != 0) {
                tableCreateBuilder.append(",");
            }
            if (numericDataType.get(i)) {
                tableCreateBuilder.append(headers.get(i)).append(" INT ");
            } else {
                tableCreateBuilder.append(headers.get(i)).append(" VARCHAR ");
            }
        }
        tableCreateBuilder.append(")");
        return tableCreateBuilder;
    }

    /**
     * Runs the user's --query against the populated database and writes the
     * results (header line plus tab-separated rows) to the --out file.
     */
    private void runQueryAndWriteOutput(Connection connection) throws IOException, SQLException {
        try (BufferedWriter outputWriter = Files.newBufferedWriter(Paths.get(options.valueOf(outFileSpec)),
                Charset.defaultCharset(), StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
            QueryRunner runner = new QueryRunner();
            List<Map<String, Object>> mapList = runner.query(connection, options.valueOf(querySpec),
                    new MapListHandler());
            if (mapList.isEmpty()) {
                // NOTE(review): System.exit skips the caller's finally block, so
                // temp files are not cleaned up on this path; preserved for
                // backward compatibility with existing callers/scripts.
                Log.fatal("Query had no results");
                System.exit(-1);
            }
            // NOTE(review): column order comes from the first row's map; assumed
            // consistent across rows for the same MapListHandler result — verify
            StringBuilder builder = new StringBuilder();
            for (String columnName : mapList.get(0).keySet()) {
                if (builder.length() != 0) {
                    builder.append("\t");
                }
                builder.append(StringUtils.lowerCase(columnName));
            }
            outputWriter.append(builder);
            outputWriter.newLine();
            for (Map<String, Object> rowMap : mapList) {
                StringBuilder rowBuilder = new StringBuilder();
                for (Entry<String, Object> e : rowMap.entrySet()) {
                    if (rowBuilder.length() != 0) {
                        rowBuilder.append("\t");
                    }
                    rowBuilder.append(e.getValue());
                }
                outputWriter.append(rowBuilder);
                outputWriter.newLine();
            }
        }
    }

    /**
     * Appends the H2 CSVREAD clause to the table creation statement and
     * executes it, bulk-loading the import file into the report table.
     */
    private void bulkImportH2(StringBuilder tableCreateBuilder, Connection connection, Path importFile)
            throws SQLException {
        tableCreateBuilder.append("AS SELECT * FROM CSVREAD('").append(importFile.toString())
                .append("', null, 'fieldSeparator=\t')");
        Log.debug("Table creation query is: " + tableCreateBuilder.toString());
        try (Statement createTableStatement = connection.createStatement()) {
            createTableStatement.executeUpdate(tableCreateBuilder.toString());
        }
    }

    /**
     * Opens a connection to an embedded database under the given temp
     * directory (or in memory when the protocol contains "mem:").
     *
     * @throws ClassNotFoundException if the driver class is missing
     */
    private Connection spinUpEmbeddedDB(Path randomTempDirectory, String driver, String protocol)
            throws SQLException, ClassNotFoundException {
        // JDBC 4 drivers self-register via ServiceLoader; loading the class is
        // kept for safety, but the deprecated newInstance() call was unnecessary
        Class.forName(driver);
        return DriverManager.getConnection(protocol + randomTempDirectory.toString() + "/tempDB;create=true");
    }

    /**
     * Pulls a fresh file provenance report from the web service into a temp
     * file, filtering to non-skipped, completed, successful results.
     *
     * @return path of the freshly written temporary report
     */
    private Path populateOriginalReportFromWS() throws IOException {
        Map<FileProvenanceParam, List<String>> map = ProvenanceUtility.convertOptionsToMap(options, metadata);
        // specify some standard filters that are required for filters
        map.put(FileProvenanceParam.skip, new ImmutableList.Builder<String>().add("false").build());
        map.put(FileProvenanceParam.workflow_run_status,
                new ImmutableList.Builder<String>().add(WorkflowRunStatus.completed.toString()).build());
        map.put(FileProvenanceParam.processing_status,
                new ImmutableList.Builder<String>().add(ProcessingStatus.success.toString()).build());
        Path originalReport = Files.createTempFile("file_provenance", "txt");
        Log.debug("Original report written to " + originalReport.toString());
        try (BufferedWriter originalWriter = Files.newBufferedWriter(originalReport, Charset.defaultCharset())) {
            metadata.fileProvenanceReport(map, originalWriter);
        }
        return originalReport;
    }

    @Override
    public ReturnValue clean_up() {
        return new ReturnValue();
    }
}