List of usage examples for org.apache.commons.csv CSVFormat RFC4180
CSVFormat RFC4180
CSVFormat.RFC4180 is the predefined format that follows RFC 4180: comma-separated fields, double-quote quoting, and CRLF record separators. The examples below show how it is used in real projects.
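Before the project examples, here is a minimal, self-contained sketch (not taken from any of the listed projects; the sample data and class name are invented) showing the most basic use of CSVFormat.RFC4180 to parse an in-memory CSV string:

import java.io.StringReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class Rfc4180ParseExample {
    public static void main(String[] args) throws Exception {
        String csv = "id,name\r\n1,\"Doe, John\"\r\n2,Jane\r\n";
        // Parse with the strict RFC 4180 rules: comma delimiter, double-quote quoting, CRLF records.
        try (CSVParser parser = CSVFormat.RFC4180.parse(new StringReader(csv))) {
            for (CSVRecord record : parser) {
                // Fields are accessed by zero-based index; the quoted comma is handled by the parser.
                System.out.println(record.get(0) + " -> " + record.get(1));
            }
        }
    }
}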
From source file:org.talend.components.localio.runtime.fixed.FixedDatasetRuntime.java
@Override
public Schema getSchema() {
    switch (properties.format.getValue()) {
    case CSV:
        // Try to get the schema from the specified value.
        String csvSchema = properties.csvSchema.getValue();
        if (!csvSchema.trim().isEmpty()) {
            try {
                CSVRecord r = CSVFormat.RFC4180 //
                        .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                        .withRecordSeparator(properties.getRecordDelimiter()) //
                        .parse(new StringReader(csvSchema)).iterator().next();
                return CsvRecordToIndexedRecordConverter.inferSchema(r);
            } catch (Exception e) {
                throw LocalIOErrorCode.createCannotParseSchema(e, csvSchema);
            }
        }
        // Fall back to a schema based on the number of columns.
        try {
            int maxSize = 0;
            for (CSVRecord r : CSVFormat.RFC4180 //
                    .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                    .withRecordSeparator(properties.getRecordDelimiter())
                    .parse(new StringReader(properties.values.getValue())))
                maxSize = Math.max(maxSize, r.size());
            if (maxSize == 0)
                throw LocalIOErrorCode.requireAtLeastOneRecord(new RuntimeException());
            return CsvRecordToIndexedRecordConverter.inferSchema(maxSize);
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.values.getValue());
        }
    case JSON:
        if (properties.values.getValue().trim().isEmpty())
            throw LocalIOErrorCode.requireAtLeastOneRecord(new RuntimeException());
        return getValues(1).get(0).getSchema();
    case AVRO:
        try {
            return new Schema.Parser().parse(properties.schema.getValue());
        } catch (Exception e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.schema.getValue());
        }
    }
    throw LocalIOErrorCode.createCannotParseSchema(null, properties.schema.getValue());
}
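The withDelimiter and withRecordSeparator calls above return modified copies of RFC4180, which is how this runtime honours user-chosen delimiters before reading the first record as a schema. A minimal sketch of the same pattern, using a hypothetical semicolon-delimited header string instead of the Talend properties:

import java.io.StringReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class CustomDelimiterSketch {
    public static void main(String[] args) throws Exception {
        String csvSchema = "id;name;age"; // hypothetical user-supplied header line
        // CSVFormat is immutable: each with* call returns a new format derived from RFC4180.
        CSVFormat format = CSVFormat.RFC4180
                .withDelimiter(';')
                .withRecordSeparator("\n");
        // Take the first record, as the getSchema() example above does.
        CSVRecord first = format.parse(new StringReader(csvSchema)).iterator().next();
        System.out.println("columns: " + first.size()); // columns: 3
    }
}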
From source file:org.talend.components.localio.runtime.fixed.FixedDatasetRuntime.java
public List<IndexedRecord> getValues(int limit) {
    List<IndexedRecord> values = new ArrayList<>();
    switch (properties.format.getValue()) {
    case CSV:
        try {
            CsvRecordToIndexedRecordConverter converter = new CsvRecordToIndexedRecordConverter(getSchema());
            for (CSVRecord r : CSVFormat.RFC4180 //
                    .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                    .withRecordSeparator(properties.getRecordDelimiter())
                    .parse(new StringReader(properties.values.getValue())))
                values.add(converter.convertToAvro(r));
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.values.getValue());
        }
        break;
    case JSON:
        ObjectMapper mapper = new ObjectMapper();
        JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(mapper);
        JsonGenericRecordConverter converter = null;
        JsonFactory jsonFactory = new JsonFactory();
        try (StringReader r = new StringReader(properties.values.getValue())) {
            Iterator<JsonNode> value = mapper.readValues(jsonFactory.createParser(r), JsonNode.class);
            int count = 0;
            while (value.hasNext() && count++ < limit) {
                String json = value.next().toString();
                if (converter == null) {
                    Schema jsonSchema = jsonSchemaInferrer.inferSchema(json);
                    converter = new JsonGenericRecordConverter(jsonSchema);
                }
                values.add(converter.convertToAvro(json));
            }
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseJson(e, properties.schema.getValue(),
                    properties.values.getValue());
        }
        break;
    case AVRO:
        Schema schema = getSchema();
        if (isRandom()) {
            GeneratorFunction<IndexedRecord> gf = (GeneratorFunction<IndexedRecord>) GeneratorFunctions
                    .of(getSchema());
            GeneratorFunction.GeneratorContext ctx = GeneratorFunction.GeneratorContext.of(0, 0L);
            for (int i = 0; i < limit; i++) {
                ctx.setRowId(i);
                values.add(gf.apply(ctx));
            }
        } else {
            try (ByteArrayInputStream bais = new ByteArrayInputStream(
                    properties.values.getValue().trim().getBytes())) {
                JsonDecoder decoder = DecoderFactory.get().jsonDecoder(schema, bais);
                DatumReader<IndexedRecord> reader = new GenericDatumReader<>(schema);
                int count = 0;
                while (count++ < limit) {
                    values.add(reader.read(null, decoder));
                }
            } catch (EOFException e) {
                // Indicates the end of the values.
            } catch (IOException e) {
                throw LocalIOErrorCode.createCannotParseAvroJson(e, properties.schema.getValue(),
                        properties.values.getValue());
            }
        }
        break;
    }
    return values;
}
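Stripped of the Talend converter classes, the CSV branch above is just an iteration over the parsed CSVRecord stream. This illustrative sketch (invented data, plain lists instead of Avro records) shows the underlying commons-csv loop on its own:

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class CollectRowsSketch {
    public static void main(String[] args) throws Exception {
        String values = "1,red\r\n2,green\r\n3,blue\r\n";
        List<List<String>> rows = new ArrayList<>();
        // Each CSVRecord becomes one row; a real converter could build Avro records here instead.
        for (CSVRecord record : CSVFormat.RFC4180.parse(new StringReader(values))) {
            List<String> row = new ArrayList<>();
            record.forEach(row::add);
            rows.add(row);
        }
        System.out.println(rows); // [[1, red], [2, green], [3, blue]]
    }
}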
From source file:org.thegalactic.context.io.ContextSerializerCsv.java
/**
 * Read a context from a csv file.
 *
 * The following format is respected:
 *
 * The first line contains the attribute names, the other lines contain the
 * observation identifiers followed by boolean values
 *
 * ~~~
 * "",a,b,c,d,e
 * 1,1,0,1,0,0
 * 2,1,1,0,0,0
 * 3,0,1,0,1,1
 * 4,0,0,1,0,1
 * ~~~
 *
 * If the first attribute is the empty string, the first column corresponds
 * to the individual identifiers. In the other case, the individual
 * identifiers will be generated by successive integers.
 *
 * ~~~
 * a,b,c,d,e
 * 1,0,1,0,0
 * 1,1,0,0,0
 * 0,1,0,1,1
 * 0,0,1,0,1
 * ~~~
 *
 * @param context a context to read
 * @param file    a file
 *
 * @throws IOException When an IOException occurs
 */
public void read(Context context, BufferedReader file) throws IOException {
    // Parse the file
    CSVParser parser = CSVFormat.RFC4180.parse(file);

    // Get the records and record size
    List<CSVRecord> records = parser.getRecords();
    int length = records.size();

    // Verify length
    if (length == 0) {
        throw new IOException("CSV cannot be empty");
    }

    // Get the attributes and the attribute size
    CSVRecord attributes = records.get(0);
    int size = attributes.size();

    // Detect invalid attribute size
    if (size == 1 && attributes.get(0).equals("")) {
        throw new IOException("Attribute size cannot be 0");
    }

    // Index of the first attribute
    int first = 0;
    if (attributes.get(0).equals("")) {
        first = 1;
    }

    // Get the attributes
    for (int i = first; i < size; i++) {
        String attribute = attributes.get(i);

        // Detect duplicated attribute
        if (!context.addToAttributes(attribute)) {
            throw new IOException("Duplicated attribute");
        }

        // Detect empty attribute
        if ("".equals(attribute)) {
            throw new IOException("Empty attribute");
        }
    }

    // Get the data
    for (int j = 1; j < length; j++) {
        // Get the current record
        CSVRecord record = records.get(j);

        // Detect incorrect size
        if (record.size() != size) {
            throw new IOException("Line does not have the correct number of attributes");
        }

        // Get the observation identifier
        String identifier;
        if (first == 1) {
            identifier = record.get(0);
        } else {
            identifier = String.valueOf(j);
        }

        // Detect duplicated identifier
        if (!context.addToObservations(identifier)) {
            throw new IOException("Duplicated identifier");
        }

        // Add the extent/intent for the current identifier and current attribute
        for (int i = first; i < size; i++) {
            if (record.get(i).equals("1")) {
                context.addExtentIntent(identifier, attributes.get(i));
            }
        }
    }

    // Close the parser
    parser.close();
    context.setBitSets();
}
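The read() method above loads the whole input with parser.getRecords() and then treats the first record as the header row. A minimal sketch of that load-everything pattern on a made-up two-observation context (class name and data are illustrative only):

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.List;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class GetRecordsSketch {
    public static void main(String[] args) throws Exception {
        String csv = "\"\",a,b\r\n1,1,0\r\n2,0,1\r\n";
        try (CSVParser parser = CSVFormat.RFC4180.parse(new BufferedReader(new StringReader(csv)))) {
            // getRecords() reads the remaining input eagerly into memory.
            List<CSVRecord> records = parser.getRecords();
            CSVRecord header = records.get(0); // "", a, b
            System.out.println("attributes: " + (header.size() - 1));
            for (int j = 1; j < records.size(); j++) {
                CSVRecord row = records.get(j);
                System.out.println("observation " + row.get(0) + " has " + (row.size() - 1) + " values");
            }
        }
    }
}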
From source file:org.thegalactic.context.io.ContextSerializerCsv.java
/**
 * Write a context to a csv file.
 *
 * The following format is respected:
 *
 * The first line contains the attribute names, the other lines contain the
 * observation identifiers followed by boolean values
 *
 * ~~~
 * "",a,b,c,d,e
 * 1,1,0,1,0,0
 * 2,1,1,0,0,0
 * 3,0,1,0,1,1
 * 4,0,0,1,0,1
 * ~~~
 *
 * @param context a context to write
 * @param file    a file
 *
 * @throws IOException When an IOException occurs
 */
public void write(Context context, BufferedWriter file) throws IOException {
    CSVPrinter printer = new CSVPrinter(file, CSVFormat.RFC4180);

    // Get the observations and the attributes
    TreeSet<Comparable> observations = context.getObservations();
    TreeSet<Comparable> attributes = context.getAttributes();

    // Prepare the attribute line
    printer.print("");
    for (Comparable attribute : attributes) {
        // Write each attribute
        printer.print(attribute);
    }
    printer.println();

    for (Comparable observation : observations) {
        // Write the observation
        printer.print(observation);

        // Write the extent/intents
        for (Comparable attribute : attributes) {
            if (context.getIntent(observation).contains(attribute)) {
                printer.print(1);
            } else {
                printer.print(0);
            }
        }
        printer.println();
    }
    printer.close();
}
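For writing, the class pairs RFC4180 with CSVPrinter: print() emits one field (quoting it when needed) and println() ends the record with the format's CRLF separator. A small standalone sketch of the same API writing invented data to a StringWriter:

import java.io.StringWriter;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

public class PrinterSketch {
    public static void main(String[] args) throws Exception {
        StringWriter out = new StringWriter();
        try (CSVPrinter printer = new CSVPrinter(out, CSVFormat.RFC4180)) {
            // Header row: an empty first cell followed by attribute names.
            printer.print("");
            printer.print("a");
            printer.print("b");
            printer.println();
            // One observation row; printRecord() is a shortcut for print() calls plus println().
            printer.printRecord("1", 1, 0);
        }
        System.out.print(out); // header row then data row, each terminated by CRLF
    }
}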
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
/**
 * Generate a random sample of the dataset using Spark.
 */
public static SamplePoints getSampleFromDAS(String path, int sampleSize, String sourceType, int tenantId)
        throws MLMalformedDatasetException {
    JavaSparkContext sparkContext = null;
    try {
        Map<String, Integer> headerMap = null;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();

        // java spark context
        sparkContext = MLCoreServiceValueHolder.getInstance().getSparkContext();
        JavaRDD<String> lines;
        String headerLine = extractHeaderLine(path, tenantId);
        headerMap = generateHeaderMap(headerLine, CSVFormat.RFC4180);

        // DAS case path = table name
        lines = getLinesFromDASTable(path, tenantId, sparkContext);
        return getSamplePoints(sampleSize, true, headerMap, columnData, CSVFormat.RFC4180, lines);
    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + path + ". Cause: " + e, e);
    }
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
public static JavaRDD<String> getLinesFromDASTable(String tableName, int tenantId, JavaSparkContext sparkContext)
        throws AnalyticsTableNotAvailableException, AnalyticsException {
    JavaRDD<String> lines;
    String tableSchema = extractTableSchema(tableName, tenantId);
    SQLContext sqlCtx = new SQLContext(sparkContext);
    sqlCtx.sql("CREATE TEMPORARY TABLE ML_REF USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider "
            + "OPTIONS (" + "tenantId \"" + tenantId + "\", " + "tableName \"" + tableName + "\", "
            + "schema \"" + tableSchema + "\"" + ")");
    DataFrame dataFrame = sqlCtx.sql("select * from ML_REF");
    // Additional auto-generated column "_timestamp" needs to be dropped because it is not in the schema.
    JavaRDD<Row> rows = dataFrame.drop("_timestamp").javaRDD();
    lines = rows.map(new RowsToLines.Builder().separator(CSVFormat.RFC4180.getDelimiter() + "").build());
    return lines;
}
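Here RFC4180 is used only as a source of the delimiter: getDelimiter() returns the ',' that the generated Spark lines must use. Joining raw values with the delimiter does not apply RFC 4180 quoting; when quoting matters, CSVFormat.format() builds a single correctly escaped line. A short illustrative sketch (values invented):

import org.apache.commons.csv.CSVFormat;

public class DelimiterSketch {
    public static void main(String[] args) {
        // getDelimiter() exposes the format's separator character (',' for RFC4180).
        String naive = String.join(CSVFormat.RFC4180.getDelimiter() + "", "1", "Doe, John");
        // format() quotes fields that contain the delimiter; naive joining does not.
        String quoted = CSVFormat.RFC4180.format("1", "Doe, John");
        System.out.println(naive);  // 1,Doe, John   (ambiguous)
        System.out.println(quoted); // 1,"Doe, John"
    }
}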
From source file:org.wso2.carbon.ml.dataset.internal.DatasetSummary.java
/**
 * Constructor to create the parser for the data-set and initialize the lists.
 *
 * @param csvDataFile File object of the data-set CSV file.
 * @param datasetID   Unique Identifier of the data-set.
 * @throws DatasetSummaryException
 */
protected DatasetSummary(File csvDataFile, String datasetID) throws DatasetSummaryException {
    this.datasetID = datasetID;
    try {
        Reader reader = new InputStreamReader(new FileInputStream(csvDataFile.getAbsolutePath()),
                DatasetConfigurations.UTF_8);
        this.parser = new CSVParser(reader, CSVFormat.RFC4180.withHeader().withAllowMissingColumnNames(true));
        this.headerMap = this.parser.getHeaderMap();
        int noOfFeatures = this.headerMap.size();

        // Initialize the lists.
        this.missing = new int[noOfFeatures];
        this.unique = new int[noOfFeatures];
        this.type = new String[noOfFeatures];
        this.histogram = new EmpiricalDistribution[noOfFeatures];
        for (int i = 0; i < noOfFeatures; i++) {
            this.descriptiveStats.add(new DescriptiveStatistics());
            this.graphFrequencies.add(new TreeMap<String, Integer>());
            this.columnData.add(new ArrayList<String>());
        }
    } catch (IOException e) {
        throw new DatasetSummaryException(
                "Error occured while reading from the dataset " + datasetID + ": " + e.getMessage(), e);
    }
}
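withHeader() with no arguments tells the parser to read the first record as the header, and withAllowMissingColumnNames(true) tolerates blank header cells; getHeaderMap() then maps each column name to its index. A minimal sketch of that combination on an in-memory string (names and data invented):

import java.io.StringReader;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class HeaderMapSketch {
    public static void main(String[] args) throws Exception {
        String csv = "age,,salary\r\n34,x,1200\r\n";
        CSVFormat format = CSVFormat.RFC4180.withHeader().withAllowMissingColumnNames(true);
        try (CSVParser parser = new CSVParser(new StringReader(csv), format)) {
            // The blank header cell is tolerated; named columns map to their positions.
            Map<String, Integer> headerMap = parser.getHeaderMap();
            System.out.println(headerMap);
            for (CSVRecord record : parser) {
                // Fields can be read by column name once a header is configured.
                System.out.println(record.get("age") + " / " + record.get("salary"));
            }
        }
    }
}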
From source file:org.wso2.carbon.notebook.api.paragraph.PreprocessorEndpoint.java
/**
 * Process the selected dataset
 *
 * @param request              Http servlet request
 * @param preprocessParameters JSON object string with parameters for pre-processing
 * @return response
 */
@POST
@Path("/preprocess")
public Response preprocess(@Context HttpServletRequest request, String preprocessParameters) {
    HttpSession session = request.getSession();
    int tenantID = (Integer) session.getAttribute("tenantID");
    PreprocessorRequest preprocessRequest = new Gson().fromJson(preprocessParameters, PreprocessorRequest.class);
    String tableName = preprocessRequest.getTableName();
    String preprocessedTableName = preprocessRequest.getPreprocessedTableName();
    List<Feature> featureList = preprocessRequest.getFeatureList();
    List<Feature> orderedFeatureList = new ArrayList<>();
    String headerLine;
    String jsonString;
    JavaRDD<String[]> preprocessedLines;
    GeneralResponse response;

    // Order the features according to the schema
    for (int i = 0; i < featureList.size(); i++) {
        orderedFeatureList.add(new Feature());
    }
    try {
        headerLine = MLUtils.extractHeaderLine(tableName, tenantID);
        for (Feature feature : featureList) {
            int index = MLUtils.getFeatureIndex(feature.getName(), headerLine,
                    String.valueOf(CSVFormat.RFC4180.getDelimiter()));
            feature.setIndex(index);
            orderedFeatureList.set(index, feature);
        }
        preprocessedLines = PreprocessorUtils.preProcess(tenantID, tableName, orderedFeatureList, headerLine);
        PreprocessorUtils.saveTable(tenantID, tableName, preprocessedTableName, orderedFeatureList,
                preprocessedLines);
        response = new GeneralResponse(Status.SUCCESS);
    } catch (AnalyticsException | PreprocessorException e) {
        response = new ErrorResponse(e.getMessage());
    } catch (RuntimeException e) {
        response = new ErrorResponse("Internal Server Error");
    }
    jsonString = new Gson().toJson(response);
    return Response.ok(jsonString, MediaType.APPLICATION_JSON).build();
}
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Generate a random sample of the data set using Spark.
 *
 * @param tableName  Name of the table
 * @param sampleSize Sample size
 * @param tenantId   Tenant ID
 * @return Sample points
 */
public static SamplePoints getSampleFromDAS(String tableName, int sampleSize, int tenantId)
        throws MLMalformedDatasetException {
    JavaSparkContext sparkContext;
    try {
        Map<String, Integer> headerMap;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();

        // java spark context
        sparkContext = ServiceHolder.getSparkContextService().getJavaSparkContext();
        JavaRDD<String> lines;
        String headerLine = extractHeaderLine(tableName, tenantId);
        headerMap = generateHeaderMap(headerLine, CSVFormat.RFC4180);

        // DAS case path = table name
        lines = getLinesFromDASTable(tableName, tenantId, sparkContext);
        return getSamplePoints(sampleSize, true, headerMap, columnData, CSVFormat.RFC4180, lines);
    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + tableName + ". Cause: " + e, e);
    }
}
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Get the rows as lines of a table in the DAS
 *
 * @param tableName    Name of the table
 * @param tenantId     Tenant ID
 * @param sparkContext Java spark context
 * @return Table rows as lines
 */
public static JavaRDD<String> getLinesFromDASTable(String tableName, int tenantId, JavaSparkContext sparkContext)
        throws AnalyticsException {
    JavaRDD<String> lines;
    String tableSchema = extractTableSchema(tableName, tenantId);
    SQLContext sqlCtx = new SQLContext(sparkContext);
    sqlCtx.sql("CREATE TEMPORARY TABLE ML_REF USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider "
            + "OPTIONS (" + "tenantId \"" + tenantId + "\", " + "tableName \"" + tableName + "\", "
            + "schema \"" + tableSchema + "\"" + ")");
    DataFrame dataFrame = sqlCtx.sql("select * from ML_REF");
    // Additional auto-generated column "_timestamp" needs to be dropped because it is not in the schema.
    JavaRDD<Row> rows = dataFrame.drop("_timestamp").javaRDD();
    lines = rows.map(new RowsToLines.Builder().separator(CSVFormat.RFC4180.getDelimiter() + "").build());
    return lines;
}