List of usage examples for org.apache.commons.csv CSVFormat getDelimiter
public char getDelimiter()
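Returns the delimiter character of the format. As a minimal, self-contained sketch of the accessor itself (this demo class is not from the sources below; it assumes a Commons CSV version in which getDelimiter() still returns char, matching the signature above):

import org.apache.commons.csv.CSVFormat;

public class GetDelimiterDemo {
    public static void main(String[] args) {
        // Predefined formats expose their separator through getDelimiter().
        CSVFormat csv = CSVFormat.DEFAULT; // comma-separated
        CSVFormat tsv = CSVFormat.TDF;     // tab-separated
        System.out.println(csv.getDelimiter() == ',');  // true
        System.out.println(tsv.getDelimiter() == '\t'); // true

        // A recurring pattern in the examples below: turn the delimiter
        // into a one-character string for String.split/String.join.
        String joined = String.join(String.valueOf(csv.getDelimiter()), "a", "b", "c");
        System.out.println(joined); // a,b,c
    }
}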
From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java
public String streamingPredict(int tenantId, String userName, long modelId, String dataFormat,
        String columnHeader, InputStream dataStream) throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    MLModel mlModel = retrieveModel(modelId);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    StringBuilder predictionsWithData = new StringBuilder();
    try {
        String line;
        if ((line = br.readLine()) != null && line.split(csvFormat.getDelimiter() + "").length == mlModel
                .getNewToOldIndicesList().size()) {
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            }
            // cloning unencoded data to append with predictions
            List<String[]> unencodedData = new ArrayList<String[]>(data.size());
            for (String[] item : data) {
                unencodedData.add(item.clone());
            }
            List<?> predictions = predict(tenantId, userName, modelId, data);
            for (int i = 0; i < predictions.size(); i++) {
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()))
                        .append(String.valueOf(predictions.get(i))).append(MLConstants.NEW_LINE);
            }
        } else {
            int responseVariableIndex = mlModel.getResponseIndex();
            List<Integer> includedFeatureIndices = mlModel.getNewToOldIndicesList();
            List<String[]> unencodedData = new ArrayList<String[]>();
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            }
            List<?> predictions = predict(tenantId, userName, modelId, data);
            for (int i = 0; i < predictions.size(); i++) {
                // replace with predicted value
                unencodedData.get(i)[responseVariableIndex] = String.valueOf(predictions.get(i));
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()));
                predictionsWithData.deleteCharAt(predictionsWithData.length() - 1);
                predictionsWithData.append(MLConstants.NEW_LINE);
            }
        }
        return predictionsWithData.toString();
    } catch (IOException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            if (dataStream != null && br != null) {
                dataStream.close();
                br.close();
            }
        } catch (IOException e) {
            String msg = MLUtils.getErrorMsg(String.format(
                    "Error occurred while closing the streams for model [id] %s of tenant [id] %s and [user] %s.",
                    modelId, tenantId, userName), e);
            log.warn(msg, e);
        }
    }
}
From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java
public List<?> predict(int tenantId, String userName, long modelId, String dataFormat, InputStream dataStream,
        double percentile, boolean skipDecoding) throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            String[] dataRow = line.split(csvFormat.getDelimiter() + "");
            data.add(dataRow);
        }
        return predict(tenantId, userName, modelId, data, percentile, skipDecoding);
    } catch (IOException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            dataStream.close();
            br.close();
        } catch (IOException e) {
            String msg = "Error in closing input stream while publishing model";
            log.error(msg, e);
        }
    }
}
From source file:org.wso2.carbon.ml.core.impl.MLModelHandler.java
public String streamingPredict(int tenantId, String userName, long modelId, String dataFormat,
        String columnHeader, InputStream dataStream, double percentile, boolean skipDecoding)
        throws MLModelHandlerException {
    List<String[]> data = new ArrayList<String[]>();
    CSVFormat csvFormat = DataTypeFactory.getCSVFormat(dataFormat);
    MLModel mlModel = retrieveModel(modelId);
    BufferedReader br = new BufferedReader(new InputStreamReader(dataStream, StandardCharsets.UTF_8));
    StringBuilder predictionsWithData = new StringBuilder();
    try {
        String line;
        if ((line = br.readLine()) != null && line.split(csvFormat.getDelimiter() + "").length == mlModel
                .getNewToOldIndicesList().size()) {
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                data.add(dataRow);
            }
            // cloning unencoded data to append with predictions
            List<String[]> unencodedData = new ArrayList<String[]>(data.size());
            for (String[] item : data) {
                unencodedData.add(item.clone());
            }
            List<?> predictions = predict(tenantId, userName, modelId, data, percentile, skipDecoding);
            for (int i = 0; i < predictions.size(); i++) {
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()))
                        .append(String.valueOf(predictions.get(i))).append(MLConstants.NEW_LINE);
            }
        } else {
            int responseVariableIndex = mlModel.getResponseIndex();
            List<Integer> includedFeatureIndices = mlModel.getNewToOldIndicesList();
            List<String[]> unencodedData = new ArrayList<String[]>();
            if (columnHeader.equalsIgnoreCase(MLConstants.NO)) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            } else {
                predictionsWithData.append(line).append(MLConstants.NEW_LINE);
            }
            while ((line = br.readLine()) != null) {
                int count = 0;
                String[] dataRow = line.split(csvFormat.getDelimiter() + "");
                unencodedData.add(dataRow.clone());
                String[] includedFeatureValues = new String[includedFeatureIndices.size()];
                for (int index : includedFeatureIndices) {
                    includedFeatureValues[count++] = dataRow[index];
                }
                data.add(includedFeatureValues);
            }
            List<?> predictions = predict(tenantId, userName, modelId, data, percentile, skipDecoding);
            for (int i = 0; i < predictions.size(); i++) {
                // replace with predicted value
                unencodedData.get(i)[responseVariableIndex] = String.valueOf(predictions.get(i));
                predictionsWithData
                        .append(MLUtils.arrayToCsvString(unencodedData.get(i), csvFormat.getDelimiter()));
                predictionsWithData.deleteCharAt(predictionsWithData.length() - 1);
                predictionsWithData.append(MLConstants.NEW_LINE);
            }
        }
        return predictionsWithData.toString();
    } catch (IOException | ArrayIndexOutOfBoundsException e) {
        String msg = "Failed to read the data points for prediction for model [id] " + modelId;
        log.error(msg, e);
        throw new MLModelHandlerException(msg, e);
    } finally {
        try {
            if (dataStream != null && br != null) {
                dataStream.close();
                br.close();
            }
        } catch (IOException e) {
            String msg = MLUtils.getErrorMsg(String.format(
                    "Error occurred while closing the streams for model [id] %s of tenant [id] %s and [user] %s.",
                    modelId, tenantId, userName), e);
            log.warn(msg, e);
        }
    }
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
/**
 * Generate a random sample of the dataset using Spark.
 */
public static SamplePoints getSample(String path, String dataType, int sampleSize, boolean containsHeader)
        throws MLMalformedDatasetException {
    JavaSparkContext sparkContext = null;
    try {
        Map<String, Integer> headerMap = null;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();
        CSVFormat dataFormat = DataTypeFactory.getCSVFormat(dataType);
        // java spark context
        sparkContext = MLCoreServiceValueHolder.getInstance().getSparkContext();
        JavaRDD<String> lines;
        // parse lines in the dataset
        lines = sparkContext.textFile(path);
        // validates the data format of the file
        String firstLine = lines.first();
        if (!firstLine.contains("" + dataFormat.getDelimiter())) {
            throw new MLMalformedDatasetException(String.format(
                    "File content does not match the data format. [First Line] %s [Data Format] %s", firstLine,
                    dataType));
        }
        return getSamplePoints(sampleSize, containsHeader, headerMap, columnData, dataFormat, lines);
    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + path + ". Cause: " + e, e);
    }
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
private static SamplePoints getSamplePoints(int sampleSize, boolean containsHeader,
        Map<String, Integer> headerMap, List<List<String>> columnData, CSVFormat dataFormat,
        JavaRDD<String> lines) {
    int featureSize;
    int[] missing;
    int[] stringCellCount;
    int[] decimalCellCount;
    // take the first line
    String firstLine = lines.first();
    // count the number of features
    featureSize = getFeatureSize(firstLine, dataFormat);
    List<Integer> featureIndices = new ArrayList<Integer>();
    for (int i = 0; i < featureSize; i++) {
        featureIndices.add(i);
    }
    String columnSeparator = String.valueOf(dataFormat.getDelimiter());
    HeaderFilter headerFilter = new HeaderFilter.Builder().header(lines.first()).build();
    JavaRDD<String> data = lines.filter(headerFilter).cache();
    Pattern pattern = MLUtils.getPatternFromDelimiter(columnSeparator);
    LineToTokens lineToTokens = new LineToTokens.Builder().separator(pattern).build();
    JavaRDD<String[]> tokens = data.map(lineToTokens);
    // remove from cache
    data.unpersist();
    // add to cache
    tokens.cache();
    missing = new int[featureSize];
    stringCellCount = new int[featureSize];
    decimalCellCount = new int[featureSize];
    if (sampleSize >= 0 && featureSize > 0) {
        sampleSize = sampleSize / featureSize;
    }
    for (int i = 0; i < featureSize; i++) {
        columnData.add(new ArrayList<String>());
    }
    if (headerMap == null) {
        // generate the header map
        if (containsHeader) {
            headerMap = generateHeaderMap(lines.first(), dataFormat);
        } else {
            headerMap = generateHeaderMap(featureSize);
        }
    }
    // take a random sample
    List<String[]> sampleLines = tokens.takeSample(false, sampleSize);
    // remove from cache
    tokens.unpersist();
    // iterate through sample lines
    for (String[] columnValues : sampleLines) {
        for (int currentCol = 0; currentCol < featureSize; currentCol++) {
            // Check whether the row is complete.
            if (currentCol < columnValues.length) {
                // Append the cell to the respective column.
                columnData.get(currentCol).add(columnValues[currentCol]);
                if (MLConstants.MISSING_VALUES.contains(columnValues[currentCol])) {
                    // If the cell is empty, increase the missing value count.
                    missing[currentCol]++;
                } else {
                    // check whether a column value is a string
                    if (!NumberUtils.isNumber(columnValues[currentCol])) {
                        stringCellCount[currentCol]++;
                    } else if (columnValues[currentCol].indexOf('.') != -1) {
                        // if it is a number and has the decimal point
                        decimalCellCount[currentCol]++;
                    }
                }
            } else {
                columnData.get(currentCol).add(null);
                missing[currentCol]++;
            }
        }
    }
    SamplePoints samplePoints = new SamplePoints();
    samplePoints.setHeader(headerMap);
    samplePoints.setSamplePoints(columnData);
    samplePoints.setMissing(missing);
    samplePoints.setStringCellCount(stringCellCount);
    samplePoints.setDecimalCellCount(decimalCellCount);
    return samplePoints;
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
public static Map<String, Integer> generateHeaderMap(String line, CSVFormat format) {
    Map<String, Integer> headerMap = new HashMap<String, Integer>();
    String[] values = line.split("" + format.getDelimiter());
    int i = 0;
    for (String value : values) {
        headerMap.put(value, i);
        i++;
    }
    return headerMap;
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
public static int getFeatureSize(String line, CSVFormat format) {
    String[] values = line.split("" + format.getDelimiter());
    return values.length;
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
public static String[] getFeatures(String line, CSVFormat format) {
    String[] values = line.split("" + format.getDelimiter());
    return values;
}
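Note that these MLUtils helpers pass the delimiter to String.split, which treats it as a regular expression: that is safe for ',' and '\t' but would break for regex metacharacters such as '|', and it also ignores quoting, so a quoted field containing the delimiter gets split. A defensive variant, shown here as a hypothetical helper rather than part of MLUtils, quotes the delimiter first:

import java.util.regex.Pattern;

import org.apache.commons.csv.CSVFormat;

public class SafeSplit {
    // Hypothetical helper: split a line on the format's delimiter, escaping
    // it so regex metacharacters like '|' are matched literally.
    public static String[] splitLine(String line, CSVFormat format) {
        return line.split(Pattern.quote(String.valueOf(format.getDelimiter())));
    }

    public static void main(String[] args) {
        CSVFormat pipe = CSVFormat.DEFAULT.withDelimiter('|');
        // "a|b|c".split("|") would yield single characters; the quoted
        // pattern splits on the literal pipe instead.
        System.out.println(splitLine("a|b|c", pipe).length); // 3
    }
}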
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Get the cell values as String tokens from table lines
 *
 * @param dataFormat Data format of the lines
 * @param lines      Table lines from which tokens should be fetched
 * @return The string tokens of the table cell values
 */
private static JavaRDD<String[]> getTokensFromLines(CSVFormat dataFormat, JavaRDD<String> lines) {
    String columnSeparator = String.valueOf(dataFormat.getDelimiter());
    HeaderFilter headerFilter = new HeaderFilter.Builder().init(lines.first()).build();
    JavaRDD<String> data = lines.filter(headerFilter).cache();
    Pattern pattern = getPatternFromDelimiter(columnSeparator);
    LineToTokens lineToTokens = new LineToTokens.Builder().separator(pattern).build();
    JavaRDD<String[]> tokens = data.map(lineToTokens);
    // remove from cache
    data.unpersist();
    return tokens;
}
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Generate the header map with column names
 *
 * @param line   Header line of the table for which the header map is created
 * @param format Data format of the lines
 * @return Header map
 */
public static Map<String, Integer> generateHeaderMap(String line, CSVFormat format) {
    Map<String, Integer> headerMap = new HashMap<String, Integer>();
    String[] values = line.split("" + format.getDelimiter());
    int i = 0;
    for (String value : values) {
        headerMap.put(value, i);
        i++;
    }
    return headerMap;
}
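For comparison, Commons CSV can build the same header map itself, which also respects quoting; a minimal sketch (not taken from the WSO2 sources) using the parser's own header handling:

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;

public class HeaderMapDemo {
    public static void main(String[] args) throws IOException {
        // withFirstRecordAsHeader() tells the parser to read the column
        // names from the first record instead of treating it as data.
        CSVFormat format = CSVFormat.DEFAULT.withFirstRecordAsHeader();
        try (CSVParser parser = new CSVParser(new StringReader("id,name,age\n1,Ann,42\n"), format)) {
            Map<String, Integer> headerMap = parser.getHeaderMap();
            System.out.println(headerMap); // {id=0, name=1, age=2}
        }
    }
}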