Example usage for org.apache.commons.csv CSVFormat getDelimiter

Introduction

This page collects example usages of the org.apache.commons.csv CSVFormat getDelimiter method, drawn from open-source projects.

Prototype

public char getDelimiter() 

Document

Returns the character delimiting the values (typically ';', ',' or '\t').
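
A minimal sketch of the call itself, using two predefined formats that ship with Commons CSV (in the release line these snippets target, getDelimiter() returns a char; newer releases deprecate it in favor of a String-based delimiter accessor):

import org.apache.commons.csv.CSVFormat;

public class GetDelimiterExample {
    public static void main(String[] args) {
        // CSVFormat.DEFAULT delimits values with a comma.
        char delimiter = CSVFormat.DEFAULT.getDelimiter();
        System.out.println(delimiter == ',');                     // true

        // CSVFormat.TDF is the tab-delimited variant.
        System.out.println(CSVFormat.TDF.getDelimiter() == '\t'); // true
    }
}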

Usage

From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java

public String streamingPredict(int tenantId, String userName, long modelId, String dataFormat,
        String columnHeader, InputStream dataStream) throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    MLModel mlModel = retrieveModel(modelId);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    StringBuilder predictionsWithData = new StringBuilder();
    try {
        String line;
        if ((line = br.readLine()) != null && line.split(csvFormat.getDelimiter() + "").length == mlModel
                .getNewToOldIndicesList().size()) {
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            }
            // cloning unencoded data to append with predictions
            List<String[]> unencodedData = new ArrayList<String[]>(data.size());
            for (String[] item : data) {
                unencodedData.add(item.clone());
            }
            List<?> predictions = predict(tenantId, userName, modelId, data);
            for (int i = 0; i < predictions.size(); i++) {
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()))
                        .append(String.valueOf(predictions.get(i))).append(MLConstants.NEW_LINE);
            }
        } else {
            int responseVariableIndex = mlModel.getResponseIndex();
            List<Integer> includedFeatureIndices = mlModel.getNewToOldIndicesList();
            List<String[]> unencodedData = new ArrayList<String[]>();
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            }

            List<?> predictions = predict(tenantId, userName, modelId, data);
            for (int i = 0; i < predictions.size(); i++) {
                // replace with predicted value
                unencodedData.get(i)[responseVariableIndex] = String.valueOf(predictions.get(i));
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()));
                predictionsWithData.deleteCharAt(predictionsWithData.length() - 1);
                predictionsWithData.append(MLConstants.NEW_LINE);
            }
        }
        return predictionsWithData.toString();
    } catch (IOException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            if (dataStream != null && br != null) {
                dataStream.close();
                br.close();
            }
        } catch (IOException e) {
            String msg = MLUtils.getErrorMsg(String.format(
                    "Error occurred while closing the streams for model [id] %s of tenant [id] %s and [user] %s.",
                    modelId, tenantId, userName), e);
            log.warn(msg, e);
        }
    }

}
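
One thing worth flagging about the snippet above (and the others on this page): it tokenizes each line with line.split(csvFormat.getDelimiter() + ""), and String.split treats its argument as a regular expression. That is harmless for ',', ';' and '\t', but silently misbehaves for a metacharacter delimiter such as '|', and it also ignores quoting. A minimal sketch of the failure and of a literal-quoted split (the pipe-delimited format here is a made-up example):

import java.util.regex.Pattern;
import org.apache.commons.csv.CSVFormat;

public class DelimiterSplitSketch {
    public static void main(String[] args) {
        // Hypothetical pipe-delimited format, purely for illustration.
        CSVFormat format = CSVFormat.DEFAULT.withDelimiter('|');
        String line = "a|b|c";

        // '|' is regex alternation, so the naive split breaks the line into single characters.
        System.out.println(line.split(format.getDelimiter() + "").length); // 5, not 3

        // Quoting the delimiter makes the split literal.
        String[] tokens = line.split(Pattern.quote(String.valueOf(format.getDelimiter())));
        System.out.println(tokens.length); // 3
    }
}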

From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java

public List<?> predict(int tenantId, String userName, long modelId, String dataFormat, InputStream dataStream,
        double percentile, boolean skipDecoding) throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            String[] dataRow = line.split(csvFormat.getDelimiter() + "");
            data.add(dataRow);
        }
        return predict(tenantId, userName, modelId, data, percentile, skipDecoding);
    } catch (IOException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            dataStream.close();
            br.close();
        } catch (IOException e) {
            String msg = "Error in closing input stream while publishing model";
            log.error(msg, e);
        }
    }

}

From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java

public String streamingPredict(int tenantId, String userName, long modelId, String dataFormat,
        String columnHeader, InputStream dataStream, double percentile, boolean skipDecoding)
        throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    MLModel mlModel = retrieveModel(modelId);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    StringBuilder predictionsWithData = new StringBuilder();
    try {
        String line;
        if ((line = br.readLine()) != null && line.split(csvFormat.getDelimiter() + "").length == mlModel
                .getNewToOldIndicesList().size()) {
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            }
            // cloning unencoded data to append with predictions
            List<String[]> unencodedData = new ArrayList<String[]>(data.size());
            for (String[] item : data) {
                unencodedData.add(item.clone());
            }
            List<?> predictions = predict(tenantId, userName, modelId, data, percentile, skipDecoding);
            for (int i = 0; i < predictions.size(); i++) {
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()))
                        .append(String.valueOf(predictions.get(i))).append(MLConstants.NEW_LINE);
            }
        } else {
            int responseVariableIndex = mlModel.getResponseIndex();
            List<Integer> includedFeatureIndices = mlModel.getNewToOldIndicesList();
            List<String[]> unencodedData = new ArrayList<String[]>();
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            }

            List<?> predictions = predict(tenantId, userName, modelId, data, percentile, skipDecoding);
            for (int i = 0; i < predictions.size(); i++) {
                // replace with predicted value
                unencodedData.get(i)[responseVariableIndex] = String.valueOf(predictions.get(i));
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()));
                predictionsWithData.deleteCharAt(predictionsWithData.length() - 1);
                predictionsWithData.append(MLConstants.NEW_LINE);
            }
        }
        return predictionsWithData.toString();
    } catch (IOException | ArrayIndexOutOfBoundsException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            if (dataStream != null && br != null) {
                dataStream.close();
                br.close();
            }
        } catch (IOException e) {
            String msg = MLUtils.getErrorMsg(String.format(
                    "Error occurred while closing the streams for model [id] %s of tenant [id] %s and [user] %s.",
                    modelId, tenantId, userName), e);
            log.warn(msg, e);
        }
    }

}

From source file:org.wso2.carbon.ml.core.utils.MLUtils.java

/**
 * Generate a random sample of the dataset using Spark.
 */
public static SamplePoints getSample(String path, String dataType, int sampleSize, boolean containsHeader)
        throws MLMalformedDatasetException {

    JavaSparkContext sparkContext = null;
    try {
        Map<String, Integer> headerMap = null;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();
        CSVFormat dataFormat = DataTypeFactory.getCSVFormat(dataType);

        // java spark context
        sparkContext = MLCoreServiceValueHolder.getInstance().getSparkContext();
        JavaRDD<String> lines;

        // parse lines in the dataset
        lines = sparkContext.textFile(path);
        // validates the data format of the file
        String firstLine = lines.first();
        if (!firstLine.contains("" + dataFormat.getDelimiter())) {
            throw new MLMalformedDatasetException(String.format(
                    "File content does not match the data format. [First Line] %s [Data Format] %s", firstLine,
                    dataType));
        }
        return getSamplePoints(sampleSize, containsHeader, headerMap, columnData, dataFormat, lines);

    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + path + ". Cause: " + e, e);
    }
}
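
Note that the format validation above is only a heuristic: it accepts any first line that contains the delimiter character anywhere and rejects any that does not, so a well-formed single-column file would fail the check. A trivial illustration (plain Java, no Spark required):

public class DelimiterHeuristicSketch {
    public static void main(String[] args) {
        char delimiter = ',';
        // A header containing the delimiter passes the check...
        System.out.println("id,name".contains("" + delimiter));       // true
        // ...but a legitimate single-column header is rejected.
        System.out.println("single_column".contains("" + delimiter)); // false
    }
}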

From source file:org.wso2.carbon.ml.core.utils.MLUtils.java

private static SamplePoints getSamplePoints(int sampleSize, boolean containsHeader,
        Map<String, Integer> headerMap, List<List<String>> columnData, CSVFormat dataFormat,
        JavaRDD<String> lines) {
    int featureSize;
    int[] missing;
    int[] stringCellCount;
    int[] decimalCellCount;
    // take the first line
    String firstLine = lines.first();
    // count the number of features
    featureSize = getFeatureSize(firstLine, dataFormat);

    List<Integer> featureIndices = new ArrayList<Integer>();
    for (int i = 0; i < featureSize; i++) {
        featureIndices.add(i);
    }

    String columnSeparator = String.valueOf(dataFormat.getDelimiter());
    HeaderFilter headerFilter = new HeaderFilter.Builder().header(lines.first()).build();
    JavaRDD<String> data = lines.filter(headerFilter).cache();
    Pattern pattern = MLUtils.getPatternFromDelimiter(columnSeparator);
    LineToTokens lineToTokens = new LineToTokens.Builder().separator(pattern).build();
    JavaRDD<String[]> tokens = data.map(lineToTokens);

    // remove from cache
    data.unpersist();
    // add to cache
    tokens.cache();

    missing = new int[featureSize];
    stringCellCount = new int[featureSize];
    decimalCellCount = new int[featureSize];
    if (sampleSize >= 0 && featureSize > 0) {
        sampleSize = sampleSize / featureSize;
    }
    for (int i = 0; i < featureSize; i++) {
        columnData.add(new ArrayList<String>());
    }

    if (headerMap == null) {
        // generate the header map
        if (containsHeader) {
            headerMap = generateHeaderMap(lines.first(), dataFormat);
        } else {
            headerMap = generateHeaderMap(featureSize);
        }
    }

    // take a random sample
    List<String[]> sampleLines = tokens.takeSample(false, sampleSize);

    // remove from cache
    tokens.unpersist();

    // iterate through sample lines
    for (String[] columnValues : sampleLines) {
        for (int currentCol = 0; currentCol < featureSize; currentCol++) {
            // Check whether the row is complete.
            if (currentCol < columnValues.length) {
                // Append the cell to the respective column.
                columnData.get(currentCol).add(columnValues[currentCol]);

                if (MLConstants.MISSING_VALUES.contains(columnValues[currentCol])) {
                    // If the cell is empty, increase the missing value count.
                    missing[currentCol]++;
                } else {
                    // check whether a column value is a string
                    if (!NumberUtils.isNumber(columnValues[currentCol])) {
                        stringCellCount[currentCol]++;
                    } else if (columnValues[currentCol].indexOf('.') != -1) {
                        // if it is a number and has the decimal point
                        decimalCellCount[currentCol]++;
                    }
                }
            } else {
                columnData.get(currentCol).add(null);
                missing[currentCol]++;
            }
        }
    }

    SamplePoints samplePoints = new SamplePoints();
    samplePoints.setHeader(headerMap);
    samplePoints.setSamplePoints(columnData);
    samplePoints.setMissing(missing);
    samplePoints.setStringCellCount(stringCellCount);
    samplePoints.setDecimalCellCount(decimalCellCount);
    return samplePoints;
}
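
getPatternFromDelimiter, used above, is a helper whose source is not reproduced on this page. A plausible minimal reconstruction, assuming it does no more than compile a literal-quoted pattern for the separator (the real implementation may additionally account for delimiters inside quoted fields):

import java.util.regex.Pattern;

// Hypothetical reconstruction - not the actual MLUtils source.
public static Pattern getPatternFromDelimiter(String columnSeparator) {
    // Pattern.quote ensures metacharacter separators such as "|" are matched literally.
    return Pattern.compile(Pattern.quote(columnSeparator));
}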

From source file:org.wso2.carbon.ml.core.utils.MLUtils.java

public static Map<String, Integer> generateHeaderMap(String line, CSVFormat format) {
    Map<String, Integer> headerMap = new HashMap<String, Integer>();
    String[] values = line.split("" + format.getDelimiter());
    int i = 0;
    for (String value : values) {
        headerMap.put(value, i);
        i++;
    }
    return headerMap;
}

From source file:org.wso2.carbon.ml.core.utils.MLUtils.java

public static int getFeatureSize(String line, CSVFormat format) {
    String[] values = line.split("" + format.getDelimiter());
    return values.length;
}
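
A caveat on counting features this way: String.split with no limit argument drops trailing empty fields, so a line that ends in one or more delimiters under-reports the feature count. Passing a negative limit keeps them; a minimal sketch:

public class FeatureSizeCaveat {
    public static void main(String[] args) {
        String line = "a,b,,";
        System.out.println(line.split(",").length);      // 2 - trailing empty fields dropped
        System.out.println(line.split(",", -1).length);  // 4 - all fields kept
    }
}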

From source file:org.wso2.carbon.ml.core.utils.MLUtils.java

public static String[] getFeatures(String line, CSVFormat format) {
    String[] values = line.split("" + format.getDelimiter());
    return values;
}

From source file:org.wso2.carbon.notebook.core.util.MLUtils.java

/**
 * Get the cell values as String tokens from table lines
 *
 * @param dataFormat Data format of the lines
 * @param lines      Table lines from which taken should be fetched
 * @return The string tokens of the table cell values
 */
private static JavaRDD<String[]> getTokensFromLines(CSVFormat dataFormat, JavaRDD<String> lines) {
    String columnSeparator = String.valueOf(dataFormat.getDelimiter());
    HeaderFilter headerFilter = new HeaderFilter.Builder().init(lines.first()).build();

    JavaRDD<String> data = lines.filter(headerFilter).cache();
    Pattern pattern = getPatternFromDelimiter(columnSeparator);
    LineToTokens lineToTokens = new LineToTokens.Builder().separator(pattern).build();

    JavaRDD<String[]> tokens = data.map(lineToTokens);

    // remove from cache
    data.unpersist();

    return tokens;
}

From source file:org.wso2.carbon.notebook.core.util.MLUtils.java

/**
 * Generate the header map with column names
 *
 * @param line   Lines of the table for which the header map is created
 * @param format Data format of the lines
 * @return Header map
 */
public static Map<String, Integer> generateHeaderMap(String line, CSVFormat format) {
    Map<String, Integer> headerMap = new HashMap<String, Integer>();
    String[] values = line.split("" + format.getDelimiter());
    int i = 0;
    for (String value : values) {
        headerMap.put(value, i);
        i++;
    }
    return headerMap;
}
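
A quick usage sketch for generateHeaderMap, assuming MLUtils is on the classpath (the header line is invented; note that because the result is a HashMap keyed by column name, duplicate column names would silently overwrite earlier indices):

import java.util.Map;
import org.apache.commons.csv.CSVFormat;

public class HeaderMapExample {
    public static void main(String[] args) {
        Map<String, Integer> header = MLUtils.generateHeaderMap("id,age,salary", CSVFormat.DEFAULT);
        System.out.println(header.get("age"));    // 1
        System.out.println(header.get("salary")); // 2
    }
}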