it.cnr.isti.thematrix.scripting.modules.MatrixFileInput.java Source code

Java tutorial

Introduction

Here is the source code for it.cnr.isti.thematrix.scripting.modules.MatrixFileInput.java

Source

/*
 * Copyright (c) 2010-2014 "HPCLab at ISTI-CNR"
 *
 * This file is part of TheMatrix.
 *
 * TheMatrix is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package it.cnr.isti.thematrix.scripting.modules;

import it.cnr.isti.thematrix.common.Enums;
import it.cnr.isti.thematrix.common.Enums.CompressionType;
import it.cnr.isti.thematrix.configuration.Dynamic;
import it.cnr.isti.thematrix.configuration.LogST;
import it.cnr.isti.thematrix.configuration.MappingSingleton;
import it.cnr.isti.thematrix.exception.JDBCConnectionException;
import it.cnr.isti.thematrix.exception.SyntaxErrorInMappingException;
import it.cnr.isti.thematrix.exception.UnsupportedDatabaseDriverException;
import it.cnr.isti.thematrix.mapping.MappingManager;
import it.cnr.isti.thematrix.mapping.utils.CSVFile;
import it.cnr.isti.thematrix.mapping.utils.TempFileManager;
import it.cnr.isti.thematrix.scripting.sys.DatasetSchema;
import it.cnr.isti.thematrix.scripting.sys.MatrixModule;
import it.cnr.isti.thematrix.scripting.sys.Symbol;
import it.cnr.isti.thematrix.scripting.sys.TheMatrixSys;
import it.cnr.isti.thematrix.scripting.utils.DateUtil;

import java.io.File;
import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import javax.xml.bind.JAXBException;

import org.apache.commons.io.filefilter.NameFileFilter;

/**
 * Class implementing the FileInputModule, which allows to read in IAD CSV files with assigned schema. <br>
 * This class relies on many classes developed when extending Jerboa into TheMatrix; the corresponding initialization
 * code must be called before we call the interpreter.
 * 
 * A constructor exists that specifies the input file is a temporary one, so no connection is allowed to the DBMS,
 * functions skip most of the related checks and initializations, and module will not report bogus errors. <br>
 * 
 * TODO hooked-in code to interact with the DBMS when a file is not found in the IAD directory, exploiting the code in
 * the MappingManager, Mapper and related classes. This will need some refinement and more error checking. If we cannot
 * download a file immediately (query generation) we should not allow the script to continue (exception is raised) but
 * we should make sure that ALL needed queries are generated. This is currently not true, we only generate the first one
 * we meet. Possible solution: encode a dev/null-like CSVFile that can be returned instead, returns a few empty rows
 * then closes. <br>
 * 
 * TODO workaround added for the file name, to address a small design issue of the syntax conversion; nothing that is
 * not a CSV file should get there, and the low-level name of the file should not in general be fixed by the script. <br>
 * 
 * FIXME note that the most general constructors could have the last parameter as a hook to call if the file is not
 * found: to create it from the DB, or to pull the hasMore()/next calls of the writer of a temporary file (could it also
 * improve the SortModule ?). UNLIKELY Carefully consider when switching from disk based to memory based buffering.
 * 
 * 
 * @author edoardovacchi, massimo
 */
public class MatrixFileInput extends MatrixModule {
    private static final long serialVersionUID = 701236603521894984L;

    /** File name as received by the constructor; may carry a compression extension. */
    private final String inputFilename;
    /** Value of {@code Enums.getBaseNameFile(inputFilename)}, used for mapping lookups and CSV access. */
    private final String baseName;
    /** Schema applied to this module in {@link #setup()} and handed to the CSV reader in openFile(). */
    private final DatasetSchema inputSchema;
    /** CSV reader; stays {@code null} until openFile() has successfully located (or downloaded) the file. */
    private CSVFile inputCSV = null;
    /** Constructor flag marking the input as a temporary file; affects log levels and error reporting. */
    private boolean fileIsTemporary = false;
    /** Compression type; first parsed from the file name, later re-detected in openFile() (may become null). */
    private CompressionType compression;
    // FIXME used only in toString() ?
    private final List<String> orderByList;

    /**
     * Constructor (chained to the most general one) which accepts a schema name.
     * 
     * @param name          name of module
     * @param inputFilename name of the file to read
     * @param inputSchema   name of the input schema, resolved via {@code TheMatrixSys.getPredefinedSchema}
     * @param orderByList   list of fields (redundant?)
     */
    public MatrixFileInput(String name, String inputFilename, String inputSchema, List<String> orderByList) {
        this(name, inputFilename, TheMatrixSys.getPredefinedSchema(inputSchema), orderByList);
    }

    /**
     * Constructor taking a DatasetSchema object instead of a schema name; the input is treated as a non-temporary
     * file.
     * 
     * @param name          name of module
     * @param inputFilename name of the file to read
     * @param inputSchema   the input schema to be used for reading
     * @param orderByList   list of fields (redundant?)
     */
    public MatrixFileInput(String name, String inputFilename, DatasetSchema inputSchema, List<String> orderByList) {
        this(name, inputFilename, inputSchema, orderByList, false);
    }

    /**
     * Most general constructor, that also allows to define the input file as a temporary one.
     * 
     * The file is not accessed here: the base name and the compression type are derived from the file name only, so
     * the file is allowed not to exist yet at module creation time.
     * 
     * TODO check the RuntimeException, maybe use a subclass
     * 
     * @param name
     *            name of module
     * @param inputFilename
     *            name of the file to read
     * @param inputSchema
     *            the input schema to be used for reading
     * @param orderByList
     *            list of fields (redundant?)
     * @param flag
     *            if true, the input file is a temporary.
     */
    public MatrixFileInput(String name, String inputFilename, DatasetSchema inputSchema, List<String> orderByList,
            boolean flag) {
        super(name);
        this.inputFilename = inputFilename;
        this.baseName = Enums.getBaseNameFile(inputFilename);
        // we do not check the file now: the compression is guessed from the name only
        this.compression = Enums.parseCompressionExtension(inputFilename);
        this.inputSchema = inputSchema;
        this.orderByList = orderByList;
        this.fileIsTemporary = flag;
    }

    /**
     * This is a Hack used for graph postprocessing. Returns the absolute path of the input file if the currently
     * known compression type is {@code CompressionType.NONE} and the file already exists under the IAD path with a
     * non-zero length. In every other case returns the empty string.
     * 
     * TODO Does not bother to check if the file is temporary or not, possibly unsafe?
     * 
     * TODO when we rewrite the access to CSV files in the sorting routine, the compression layer should be below it,
     * and we can modify this routine to a simple filename getter method.
     * 
     * @return the absolute file path, if it can be stolen, empty string otherwise.
     */
    public String fileThatCanBeStolen() {
        if (this.compression == CompressionType.NONE) {
            File f = new File(Dynamic.getIadPath() + inputFilename);
            if (f.exists() && f.length() > 0) {
                return f.getAbsolutePath();
            }
        }
        return "";
    }

    /**
     * Assigns the input schema to this module and logs the module description. The file itself is not opened here;
     * that happens lazily on the first call to {@link #hasMore()}.
     */
    @Override
    public void setup() {
        this.setSchema(inputSchema);
        // log less important for temporary files
        LogST.logP(fileIsTemporary ? 2 : 1, "MatrixFileInput.setup() : " + this.toString());
    }

    /**
     * Perform all actions to open the CSV file and initialize {@code inputCSV}, so that rows can be read via the
     * standard Module methods. Steps, in order:
     * <ol>
     * <li>refuse to run if the module has more than one consumer;</li>
     * <li>re-detect the compression type from the file found under the IAD path (null if the file is not there);</li>
     * <li>if the file on disk is recognized by {@code TempFileManager.isTemporary}, accept it as is;</li>
     * <li>otherwise check the CSV existence through the MappingManager and, if that fails, try
     * {@link #openFileDownloadFromDB(MappingManager)}; on success the compression type is detected again;</li>
     * <li>on success create the CSVFile reader, give it the schema and prefetch the first batch.</li>
     * </ol>
     * When the file could not be obtained, {@code inputCSV} is left {@code null}.
     * 
     * NOTE(review): the decision whether to attempt a DB download is taken on the local on-disk check, not on the
     * {@code fileIsTemporary} constructor flag; the flag is only consulted for the final log message. Confirm
     * whether a missing temporary file should really reach the download routine.
     * 
     * @throws Error
     *             if the module has more than one consumer (or, from the download routine, if the file has no
     *             mapping)
     * @throws RuntimeException
     *             if the existence check fails with an exception, or the compression type cannot be detected for a
     *             file that was reported as available
     */
    private void openFile() {
        LogST.logP(0, "MatrixFileInput.openFile() -- Module: " + this.name);

        /*
         * assert receivers == 1. Check performed here in case wrong program graph postprocessing changes the list of
         * receivers after setup().
         */
        if (getReferenceCount() > 1) {
            LogST.logP(0, "MatrixFileInput, multiple consumers not supported");
            throw new Error("MatrixFileInput " + this.name + ": multiple consumers not supported");
        }

        /*
         * Open the specified file, initializing the inputCSV field.
         * 
         * Reference code is TheMatrix/Jerboa Mapper class. For now we will not deal here with file/value remapping;
         * DBMS download is delegated to openFileDownloadFromDB().
         */
        MappingManager fileMapper = new MappingManager();
        boolean fileOK = false; // true if we found the CSV, or downloaded it successfully

        // check the actual suffix of the file on disk; this will be null if the file is not there
        this.compression = Enums.parseCompressionExtension(Dynamic.getIadPath(), inputFilename);

        // FIXME: a local isTemporary is computed here while the instance variable fileIsTemporary is not used
        boolean isTemporary = compression != null
                && TempFileManager.isTemporary(Dynamic.getIadPath(), baseName + Enums.getFileExtension(compression));

        if (isTemporary) {
            fileOK = true;
        } else {
            try {
                File path = TempFileManager.getPathForFile(inputFilename);
                fileOK = fileMapper.checkCSVFileExistence(path.getAbsolutePath() + File.separator, baseName);
            } catch (Exception e) {
                LogST.logP(0, "MatrixFileInput.openFile() exception " + e.toString() + " for file " + baseName);
                LogST.logException(e);
                // keep the original exception as cause, so the failure can be diagnosed by the caller
                throw new RuntimeException("ERROR: MatrixFileInput() exception in mapping file " + baseName, e);
            }
            if (!fileOK) {
                fileOK = openFileDownloadFromDB(fileMapper);
            }
            // now the file SHALL exist, so let's detect its compression
            if (fileOK) {
                File path = TempFileManager.getPathForFile(inputFilename);
                this.compression = Enums.parseCompressionExtension(path.getAbsolutePath() + File.separator,
                        inputFilename);
                if (compression == null) { // file reported as available, but no usable file was detected
                    throw new RuntimeException("ERROR: MatrixFileInput() internal error");
                }
            }
        }

        /*
         * 3 cases now:
         * a) CSV found OR downloaded from DB -- all OK
         * b) CSV not downloaded, query generated -- additional message to the user
         * c) file not found and no download could be attempted -- we already wrote on log
         */
        if (fileOK) {
            // get the proper path and create csv iterator
            File path = TempFileManager.getPathForFile(inputFilename);
            inputCSV = new CSVFile(path.getAbsolutePath() + "/", baseName, "", this.compression);

            // provide it with the schema to enable data format checking
            inputCSV.setSchema(this.inputSchema);

            // load the first buffer of rows
            inputCSV.loadBatch(Dynamic.prefetchCSVSize);
        } else if (!fileIsTemporary) { // here we handle case b)
            LogST.logP(0, "MatrixFileInput, DB download halted for file " + baseName);
        }

        /*
         * FIXME deal with the case of the query not executed in a more user friendly way, see comment inside
         * openFileDownloadFromDB().
         */
    }

    /**
     * Download CSV data from the DBMS in case a valid CSV file is not found but we have a DB mapping for that
     * filename.
     * 
     * The method reads the dataset names from the mapping, asks the MappingManager to create the dataset, and then
     * checks again whether the CSV file exists. Exceptions raised while creating the dataset are logged and result
     * in a {@code false} return value.
     * 
     * TODO Maybe a file that is not downloadable should not reach here at all.
     * 
     * @param fileMapper
     *            class providing mapping information for DB-downloadable files
     * 
     * @return true if the CSV file is found after the download attempt; false if a query was only generated or an
     *         error occurred.
     * @throws Error
     *             if the mapping cannot be read or does not contain the base name of the file
     */
    private boolean openFileDownloadFromDB(MappingManager fileMapper) {

        // If we get here, we know the CSV file is not found or invalid
        boolean fileOK = false; // true if we were able to download the CSV, it becomes the return value
        Collection<String> mappedFileNames = null; // the list of names defined in the mapping.xml config
        boolean workDone = false; // true only after successful creation

        try { // check if the file belongs in those defined by our mapping
            mappedFileNames = MappingSingleton.getInstance().mapping.getDatasetNames();
        } catch (Exception e1) {
            // the null list is handled right below, as "no mapping"
            LogST.logP(0, "MatrixFileInput.openFile() - ERROR - exception while reading the mapping file "
                    + e1.toString());
            e1.printStackTrace();
        }

        if (mappedFileNames == null || !mappedFileNames.contains(baseName)) {
            LogST.logP(0,
                    "MatrixFileInput.openFileDownLoadFromDB() - ERROR - file " + baseName + " has no mapping");
            throw new Error("MatrixFileInput.openFileDownLoadFromDB() No mapping for file");
        }

        // if it is there, start routine to retrieve it
        try {
            // when retrieving, encode the values with the recoding tables // NOT YET
            // and dump to the CSV
            fileMapper.createDataset(new ArrayList<String>(Arrays.asList(baseName)));
            workDone = true;
        } catch (NoSuchAlgorithmException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - MD5 encoding algorithm not found");
            e.printStackTrace();
        } catch (JAXBException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - JAXB error");
            e.printStackTrace();
        } catch (IOException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - file not found");
            e.printStackTrace();
        } catch (SyntaxErrorInMappingException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - malformed file mapping");
            e.printStackTrace();
        } catch (SQLException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - SQL error");
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - JDBC driver not found");
            e.printStackTrace();
        } catch (JDBCConnectionException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - can't open JDBC connection");
            e.printStackTrace();
        } catch (UnsupportedDatabaseDriverException e) {
            LogST.logP(0, "MatrixFileInput.openFile() - JDB driver needed is not supported");
            e.printStackTrace();
        }

        if (workDone) { // only if we did not get any exception check for the CSV

            /*
             * TODO refactor, the Mapping Manager should return this information; for now we check again
             */
            try {
                fileOK = fileMapper.checkCSVFileExistence(baseName);
            } catch (Exception e) {
                // best effort: a failed check is treated as "file not available"
                LogST.logP(0, "MatrixFileInput, exception " + e.toString() + " for file " + baseName
                        + " in setup() not dowloaded");
            }

            LogST.logP(0, "MatrixFileInput.openFile() - DBMS download done for file" + baseName + " at time "
                    + new java.util.Date().toString());

            /*
             * END of interaction with the Database.
             * 
             * now if we really got the data, we are fine; but if a query for manual execution was produced instead, no
             * file to open --> the script execution cannot continue; we should gracefully generate any more required
             * data and exit;
             * 
             * in the future, this can be implemented by launching an internal exception, caught in the interpreter, or
             * floated to the outermost eval(), whose catch will scan all the modules in the script, triggering the
             * openFile() of all FileInput modules
             */
        }

        // 3 possible cases: a) csv downloaded from DB b) csv not downloaded, query generated c) error occurred
        // let the user know in case b) he has to run the query manually
        if (!fileOK && Dynamic.ignoreDBConnection) {
            LogST.logP(0, "MatrixFileInput - no DBMS connection - query dumped to text file\n"
                    + "Please execute the query, place the result in the directory of IAD files, validate the files.");
        }
        // case a) returns true, b) c) return false.
        return fileOK;
    }

    /**
     * Debugging method, returns basic info about the module: its name, the input file name and the order-by list.
     */
    @Override
    public String toString() {
        return String.format("FileInputModule named '%s'\n with from file: '%s'\nordered by %s", name,
                inputFilename, orderByList);
    }

    /**
     * The FileInput module does not support rewinding the file: this call only writes a log line.
     */
    @Override
    public void reset() {
        LogST.logP(2, "NO-OP: MatrixFileInput.reset()");
    }

    /**
     * Intentionally empty: this module does no work here, rows are produced by {@link #hasMore()} and
     * {@link #next()}.
     */
    public void exec() {
    }

    /**
     * Check if there are more data, and as a side effect, open the file on the first call.
     * 
     * @return true if the CSV reader reports more rows
     * @throws IllegalStateException
     *             if the input file could not be opened (not found and not downloaded), so that no reader is
     *             available; previously this condition surfaced as a bare NullPointerException
     */
    @Override
    public boolean hasMore() {
        if (inputCSV == null) {
            openFile();
        }
        if (inputCSV == null) {
            // openFile() leaves the reader unset when the file is neither on disk nor downloadable
            throw new IllegalStateException("MatrixFileInput " + this.name + ": input file " + inputFilename
                    + " is not available, cannot read from it");
        }
        return inputCSV.hasMore();
    }

    /**
     * Get next row and parse its fields into the Symbol values of this module. An empty field sets the attribute to
     * null; otherwise the field is converted according to the attribute type. A row that fails parsing with an
     * IllegalArgumentException is reported and discarded, and the following row is tried instead.
     * 
     * If the input is exhausted before a row could be parsed, only a log line is written and the attribute values
     * are left as they were.
     * 
     * FIXME Actual parsing of the data fields should be in a different class and not inside here.
     */
    @Override
    public void next() {
        int i = 0;
        String reportVal = null; // last field value read, kept only for error reporting

        while (hasMore()) {
            try {
                List<String> columns = inputCSV.next();
                List<Symbol<?>> attrs = this.attributes();
                int nHead = attrs.size();
                for (i = 0; i < nHead; i++) {
                    Symbol<?> s = attrs.get(i);
                    String val = columns.get(i);
                    reportVal = val;
                    if (val.isEmpty()) {
                        s.setValue(null);
                        continue;
                    }
                    // attribute types not listed here leave the attribute untouched
                    switch (s.type) {
                    case INT:
                        s.setValue(Integer.parseInt(val));
                        break;
                    case FLOAT:
                        s.setValue(Float.parseFloat(val));
                        break;
                    case BOOLEAN:
                        // numeric encoding first (0 = false, anything else = true), then textual true/false
                        try {
                            int parsedIntValue = Integer.parseInt(val);
                            s.setValue(parsedIntValue != 0);
                        } catch (NumberFormatException ex) {
                            s.setValue(Boolean.parseBoolean(val));
                        }
                        break;
                    case STRING:
                        s.setValue(val);
                        break;
                    case DATE:
                        s.setValue(DateUtil.parse(val));
                        break;
                    }
                }
                return; // the whole row was parsed without any exception
            } catch (IllegalArgumentException e) {
                /*
                 * Catches NumberFormatException from int/float parsing as well as exceptions from DateUtil; the
                 * whole row is reported as bad and dropped.
                 */
                LogST.logP(1, "MatrixFileInput : discarding input line, caught exception while parsing field " + i
                        + " " + attributes().get(i).toString());
                LogST.logP(2, "Exception caught: " + e.toString());

                // currently we mostly do the same for temporary and permanent files, only the file label differs
                String reportedFile = fileIsTemporary ? "temporary file" : this.inputFilename;
                LogST.errorParsing(this.name, reportedFile, reportVal, inputCSV.getRowCursor() + "",
                        i + " = " + this.inputSchema.attributes().get(i).name,
                        attributes().get(i).type.toString());

                // fall through to the loop condition: the current line is discarded and the next one is tried
            }
        }

        /*
         * if we get out of the while it means either next() was called on exhausted input, or at least one input line
         * was discarded because of parsing errors;
         * 
         * FIXME should throw a specific exception!
         */
        LogST.logP(0, "MatrixFileInput : empty input line, caught exception while parsing ");
    }

}