org.pentaho.platform.dataaccess.datasource.wizard.csv.CsvUtils.java Source code

Introduction

Here is the source code for org.pentaho.platform.dataaccess.datasource.wizard.csv.CsvUtils.java, a Pentaho data-access helper that previews uploaded CSV files, detects their character encoding with icu4j, and profiles their columns with Kettle's StringEvaluator.

Source

/*!
* This program is free software; you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License, version 2.1 as published by the Free Software
* Foundation.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* program; if not, you can obtain a copy at http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
* or from the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* Copyright (c) 2002-2013 Pentaho Corporation. All rights reserved.
*/

package org.pentaho.platform.dataaccess.datasource.wizard.csv;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.StringEvaluationResult;
import org.pentaho.di.core.util.StringEvaluator;
import org.pentaho.di.trans.steps.textfileinput.TextFileInput;
import org.pentaho.metadata.model.concept.types.DataType;
import org.pentaho.metadata.util.Util;
import org.pentaho.platform.dataaccess.datasource.wizard.models.ColumnInfo;
import org.pentaho.platform.dataaccess.datasource.wizard.models.CsvFileInfo;
import org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException;
import org.pentaho.platform.dataaccess.datasource.wizard.models.DataRow;
import org.pentaho.platform.dataaccess.datasource.wizard.models.ModelInfo;
import org.pentaho.platform.dataaccess.datasource.wizard.service.agile.AgileHelper;
import org.pentaho.platform.engine.core.system.PentahoBase;
import org.pentaho.platform.engine.core.system.PentahoSystem;
import org.pentaho.platform.util.logging.Logger;
import org.pentaho.reporting.libraries.base.util.CSVTokenizer;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.DomDriver;

public class CsvUtils extends PentahoBase {

    public static final List<String> NUMBER_FORMATS = Arrays.asList("#", "#,##0.###");

    private static final long serialVersionUID = 2498165533158485182L;

    private Log log = LogFactory.getLog(CsvUtils.class);
    public static final String DEFAULT_RELATIVE_UPLOAD_FILE_PATH = File.separatorChar + "system"
            + File.separatorChar + "metadata" + File.separatorChar + "csvfiles" + File.separatorChar; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
    public static final String TMP_FILE_PATH = File.separatorChar + "system" + File.separatorChar
            + "tmp" + File.separatorChar; //$NON-NLS-1$ //$NON-NLS-2$

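    /**
     * Reads a preview of an uploaded CSV file. The returned ModelInfo carries a CsvFileInfo
     * populated with the first <code>rows</code> raw lines plus the supplied delimiter and
     * enclosure; header rows are reset to 0 and no column profiling is done here
     * (see {@link #generateFields}).
     */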
    public ModelInfo getFileContents(String project, String name, String delimiter, String enclosure, int rows,
            boolean isFirstRowHeader, String encoding) throws Exception {
        String path;
        if (name.endsWith(".tmp")) { //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
        } else {
            String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path",
                    String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH)); //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
        }

        String fileLocation = path + name;

        ModelInfo result = new ModelInfo();
        CsvFileInfo fileInfo = new CsvFileInfo();
        fileInfo.setTmpFilename(name);
        result.setFileInfo(fileInfo);

        fileInfo.setContents(getLinesList(fileLocation, rows, encoding));
        fileInfo.setDelimiter(delimiter);
        fileInfo.setEnclosure(enclosure);
        fileInfo.setHeaderRows(0);

        // column profiling is handled separately by generateFields()
        return result;
    }

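    /**
     * Resolves the physical location of the uploaded file (the temporary upload folder for
     * ".tmp" files, otherwise the configured relative upload path) and delegates to the
     * package-local {@link #generateFields} overload for parsing and column profiling.
     */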
    public ModelInfo generateFields(String project, String filename, int rowLimit, String delimiter,
            String enclosure, int headerRows, boolean doData, boolean doColumns, String encoding) throws Exception {

        String path;
        if (filename.endsWith(".tmp")) { //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
        } else {
            String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path",
                    String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH)); //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
        }

        String fileLocation = path + filename;
        return generateFields(project, fileLocation, filename, rowLimit, delimiter, enclosure, headerRows, doData,
                doColumns, encoding);
    }

    /* package-local visibility for testing purposes */
    ModelInfo generateFields(String project, String fileLocation, String filename, int rowLimit, String delimiter,
            String enclosure, int headerRows, boolean doData, boolean doColumns, String encoding) throws Exception {
        ModelInfo result = new ModelInfo();
        CsvFileInfo fileInfo = new CsvFileInfo();
        result.setFileInfo(fileInfo);

        CsvInspector inspector = new CsvInspector();
        String sampleLine = getLines(fileLocation, 1, encoding);
        int fileType = inspector.determineFileFormat(sampleLine);

        String contents = getLines(fileLocation, rowLimit, encoding);
        fileInfo.setContents(getLinesList(fileLocation, rowLimit, encoding));
        if (delimiter.equals("")) { //$NON-NLS-1$
            delimiter = inspector.guessDelimiter(contents);
            enclosure = "\""; //$NON-NLS-1$
            headerRows = 0;
        }
        fileInfo.setDelimiter(delimiter);
        fileInfo.setEnclosure(enclosure);
        fileInfo.setHeaderRows(headerRows);
        fileInfo.setEncoding(encoding); // encoding as supplied by the caller; icu4j detection happens in getEncoding()
        fileInfo.setProject(project);
        fileInfo.setTmpFilename(filename);

        DataProfile data = getDataProfile(fileInfo, rowLimit, fileLocation, fileType, encoding);
        if (doData) {
            result.setData(data.getRows());
        }
        if (doColumns) {
            result.setColumns(data.getColumns());
        }
        return result;
    }

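    /** Collects the values of one column (by index) from the sampled data rows. */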
    private List<String> getColumnData(int columnNumber, String[][] data) {
        List<String> dataSample = new ArrayList<String>(data.length);
        for (String[] row : data) {
            dataSample.add(row[columnNumber]);
        }
        return dataSample;
    }

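    /**
     * Reads up to <code>rows</code> lines from the file with the given encoding and
     * returns them as a list of strings, one entry per line.
     */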
    protected List<String> getLinesList(String fileLocation, int rows, String encoding) throws IOException {
        List<String> lines = new ArrayList<String>();
        LineNumberReader reader = null;
        try {
            File file = new File(fileLocation);
            FileInputStream fis = new FileInputStream(file);
            InputStreamReader isr = new InputStreamReader(fis, encoding);
            reader = new LineNumberReader(isr);
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null && lineNumber < rows) {
                lines.add(line);
                lineNumber++;
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return lines;
    }

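    /**
     * Reads up to <code>rows</code> lines from the file with the given encoding,
     * keeping the end-of-line characters, and returns them as a single string.
     * Used to give the CsvInspector a raw sample of the file.
     */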
    protected String getLines(String fileLocation, int rows, String encoding) {
        File file = new File(fileLocation);

        // read one line, including all EOL characters
        InputStream in;
        InputStreamReader inr = null;
        StringBuilder line = new StringBuilder();
        int count = 0;
        try {
            in = new FileInputStream(file);
            inr = new InputStreamReader(in, encoding);

            int c = inr.read();
            boolean looking = true;
            while (looking && c > 0) {
                line.append((char) c);
                if (c == '\r' || c == '\n') {
                    // look at the next char
                    c = inr.read();
                    if (c == '\r' || c == '\n') {
                        line.append((char) c);
                        c = inr.read();
                    }
                    count++;
                    if (count == rows) {
                        looking = false;
                    }
                } else {
                    c = inr.read();
                }
            }
        } catch (IOException e) {
            //do nothing
        } finally {
            if (inr != null) {
                try {
                    inr.close();
                } catch (IOException e) {
                    // ignore this one
                }
            }
        }
        return line.toString();

    }

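    /**
     * Tokenizes up to <code>rowLimit</code> lines with CSVTokenizer, splits the sample
     * into header and data rows, and derives a ColumnInfo profile (title, id, data type,
     * format and length) for every column found.
     *
     * @throws CsvParseException if a line cannot be tokenized with the current delimiter and enclosure
     */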
    private DataProfile getDataProfile(CsvFileInfo fileInfo, int rowLimit, String fileLocation, int fileType,
            String encoding) throws Exception {
        DataProfile result = new DataProfile();
        String line = null;
        int row = 0;
        List<List<String>> headerSample = new ArrayList<List<String>>();
        List<List<String>> dataSample = new ArrayList<List<String>>(rowLimit);
        int maxColumns = 0;
        InputStreamReader reader = null;

        try {
            InputStream inputStream = new FileInputStream(fileLocation);
            UnicodeBOMInputStream bomIs = new UnicodeBOMInputStream(inputStream);
            reader = new InputStreamReader(bomIs, encoding);
            bomIs.skipBOM();

            //read each line of text file
            StringBuilder stringBuilder = new StringBuilder(1000);
            line = TextFileInput.getLine(null, reader, fileType, stringBuilder);

            while (line != null && row < rowLimit) {

                CSVTokenizer csvt = new CSVTokenizer(line, fileInfo.getDelimiter(), fileInfo.getEnclosure());
                List<String> rowData = new ArrayList<String>();
                int count = 0;

                while (csvt.hasMoreTokens()) {
                    String token = csvt.nextToken();
                    if (token != null) {
                        token = token.trim();
                    }
                    rowData.add(token);
                    count++;
                }
                if (maxColumns < count) {
                    maxColumns = count;
                }
                if (row < fileInfo.getHeaderRows()) {
                    headerSample.add(rowData);
                } else {
                    dataSample.add(rowData);
                }
                line = TextFileInput.getLine(null, reader, fileType, stringBuilder);
                row++;
            }

        } catch (IllegalArgumentException iae) {
            Logger.error(getClass().getSimpleName(), "There was an issue parsing the CSV file", iae); //$NON-NLS-1$
            throw new CsvParseException(row + 1, line);
        } catch (Exception e) {
            Logger.error(getClass().getSimpleName(), "Could not read CSV", e); //$NON-NLS-1$
            throw e;
        } finally {

            //close the file
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (Exception e) {
                // ignore failures on close so they do not mask an exception thrown above
            }
        }
        String[][] headerValues = new String[headerSample.size()][maxColumns];
        int rowNo = 0;
        for (List<String> values : headerSample) {
            int colNo = 0;
            for (String value : values) {
                headerValues[rowNo][colNo] = value;
                colNo++;
            }
            rowNo++;
        }

        int[] fieldLengths = new int[maxColumns];

        String[][] dataValues = new String[dataSample.size()][maxColumns];
        DataRow[] data = new DataRow[dataSample.size()];
        rowNo = 0;
        for (List<String> values : dataSample) {
            int colNo = 0;
            for (String value : values) {
                dataValues[rowNo][colNo] = value;

                int currentMaxLength = fieldLengths[colNo];
                if (value.length() > currentMaxLength) {
                    fieldLengths[colNo] = value.length();
                }
                colNo++;
            }
            data[rowNo] = new DataRow();
            data[rowNo].setCells(dataValues[rowNo]);
            rowNo++;
        }

        result.setRows(data);

        DecimalFormat df = new DecimalFormat("000"); //$NON-NLS-1$
        ColumnInfo[] profiles = new ColumnInfo[maxColumns];
        for (int idx = 0; idx < maxColumns; idx++) {
            ColumnInfo profile = new ColumnInfo();
            profiles[idx] = profile;
            String title = CsvFileInfo.DEFAULT_COLUMN_NAME_PREFIX + df.format(idx + 1);
            String colId = "PC_" + idx; //$NON-NLS-1$

            if (headerValues.length > 0) {
                if (headerValues[headerValues.length - 1][idx] != null) {
                    title = headerValues[headerValues.length - 1][idx];
                    colId = title;
                    if (!Util.validateId(title)) {
                        colId = Util.toId(colId);
                    }
                }
            }
            profile.setTitle(title);
            profile.setId(colId);

            List<String> samples = getColumnData(idx, dataValues);

            assumeColumnDetails(profile, samples);

        }
        result.setColumns(profiles);
        return result;
    }

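    /**
     * Runs Kettle's StringEvaluator over the sampled values of a column and copies the
     * suggested data type, conversion mask, precision and length onto the ColumnInfo.
     */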
    protected void assumeColumnDetails(ColumnInfo profile, List<String> samples) {
        StringEvaluator eval = new StringEvaluator(false, NUMBER_FORMATS, ColumnInfo.DATE_FORMATS);
        for (String sample : samples) {
            eval.evaluateString(sample);
        }
        StringEvaluationResult result = eval.getAdvicedResult();
        ValueMetaInterface meta = result.getConversionMeta();

        int type = meta.getType();
        String mask = meta.getConversionMask();
        int size;
        int precision = meta.getPrecision();

        profile.setFormat(mask);
        profile.setPrecision(precision > 0 ? precision : 0);
        profile.setDataType(convertDataType(type));

        if (meta.isString()) {
            // pad the string lengths
            size = meta.getLength() + (meta.getLength() / 2);
        } else if (meta.isInteger()) {
            size = meta.getLength();
        } else {
            size = precision > 0 ? meta.getLength() : 0;
        }

        profile.setLength(size);
    }

    @Override
    public Log getLogger() {
        return log;
    }

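    /**
     * Detects the character encoding of an uploaded file by passing its first 1024 bytes
     * to the icu4j CharsetDetector.
     */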
    public String getEncoding(String fileName) throws Exception {

        String path;
        if (fileName.endsWith(".tmp")) { //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
        } else {
            String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path",
                    String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH)); //$NON-NLS-1$
            path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
        }
        String fileLocation = path + fileName;

        String encoding;
        try {
            byte[] bytes = new byte[1024];
            InputStream inputStream = new FileInputStream(new File(fileLocation));
            inputStream.read(bytes);
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(bytes);
            CharsetMatch charsetMatch = charsetDetector.detect();
            encoding = charsetMatch.getName();
            inputStream.close();
        } catch (Exception e) {
            log.error(e);
            throw e;
        }
        return encoding;
    }

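    /**
     * Loads a previously serialized ModelInfo ("filename.xml" in the project folder)
     * using XStream.
     */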
    public ModelInfo getModelInfo(String project, String filename) throws FileNotFoundException {
        XStream xstream = new XStream(new DomDriver("UTF-8")); //$NON-NLS-1$
        xstream.alias("modelInfo", ModelInfo.class); //$NON-NLS-1$
        xstream.alias("columnInfo", ColumnInfo.class); //$NON-NLS-1$
        String filepath = AgileHelper.getFolderPath(project) + "/" + filename + ".xml"; //$NON-NLS-1$ //$NON-NLS-2$
        log.debug(filepath);
        File f = new File(filepath);
        FileInputStream fis = new FileInputStream(f);
        return (ModelInfo) xstream.fromXML(fis);
    }

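    /** Maps Kettle ValueMetaInterface type codes onto metadata DataType values. */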
    private DataType convertDataType(int type) {
        switch (type) {
        case 1: // ValueMetaInterface.TYPE_NUMBER
        case 5: // ValueMetaInterface.TYPE_INTEGER
        case 6: // ValueMetaInterface.TYPE_BIGNUMBER
            return DataType.NUMERIC;
        case 3: // ValueMetaInterface.TYPE_DATE
            return DataType.DATE;
        case 4: // ValueMetaInterface.TYPE_BOOLEAN
            return DataType.BOOLEAN;
        default: // ValueMetaInterface.TYPE_STRING and anything unrecognised
            return DataType.STRING;
        }
    }

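    /** Simple holder for the sampled data rows and the column profiles derived from them. */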
    private static class DataProfile {
        DataRow[] rows = null;
        ColumnInfo[] columns = null;

        public DataRow[] getRows() {
            return rows;
        }

        public void setRows(DataRow[] rows) {
            this.rows = rows;
        }

        public ColumnInfo[] getColumns() {
            return columns;
        }

        public void setColumns(ColumnInfo[] columns) {
            this.columns = columns;
        }
    }

}
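
Usage

The sketch below shows how the class above might be driven. It is not part of the Pentaho source: the project name and the file name "orders.csv" are hypothetical, and a fully initialised PentahoSystem application context is assumed, since CsvUtils resolves file locations through it.

import org.pentaho.platform.dataaccess.datasource.wizard.csv.CsvUtils;
import org.pentaho.platform.dataaccess.datasource.wizard.models.ModelInfo;

public class CsvUtilsExample {
    public static void main(String[] args) throws Exception {
        CsvUtils utils = new CsvUtils();

        // Detect the encoding of a file sitting in the configured upload folder
        // ("orders.csv" is a hypothetical file name used only for illustration).
        String encoding = utils.getEncoding("orders.csv");

        // Preview the first 10 raw lines of the file; no column profiling yet.
        ModelInfo preview = utils.getFileContents("myProject", "orders.csv", ",", "\"", 10, true, encoding);

        // Profile up to 100 rows with one header row: column titles, ids, data types,
        // formats and lengths end up in the returned ModelInfo.
        ModelInfo model = utils.generateFields("myProject", "orders.csv", 100, ",", "\"", 1, true, true, encoding);
    }
}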