List of usage examples for org.apache.commons.csv CSVFormat RFC4180
CSVFormat RFC4180
CSVFormat.RFC4180 is the predefined format that follows RFC 4180: comma-separated fields, double-quote quoting, and CRLF record separators. The examples below show how it is used in real projects.
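Before the project examples, here is a minimal, self-contained sketch (not taken from any of the listed projects; the sample data and class name are invented) showing the most basic use of CSVFormat.RFC4180 to parse an in-memory CSV string:

import java.io.StringReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class Rfc4180ParseExample {
    public static void main(String[] args) throws Exception {
        String csv = "id,name\r\n1,\"Doe, John\"\r\n2,Jane\r\n";
        // Parse with the strict RFC 4180 rules: comma delimiter, double-quote quoting, CRLF records.
        try (CSVParser parser = CSVFormat.RFC4180.parse(new StringReader(csv))) {
            for (CSVRecord record : parser) {
                // Fields are accessed by zero-based index; the quoted comma is handled by the parser.
                System.out.println(record.get(0) + " -> " + record.get(1));
            }
        }
    }
}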
From source file:org.talend.components.localio.runtime.fixed.FixedDatasetRuntime.java
@Override
public Schema getSchema() {
    switch (properties.format.getValue()) {
    case CSV:
        // Try to get the schema from the specified value.
        String csvSchema = properties.csvSchema.getValue();
        if (!csvSchema.trim().isEmpty()) {
            try {
                CSVRecord r = CSVFormat.RFC4180 //
                        .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                        .withRecordSeparator(properties.getRecordDelimiter()) //
                        .parse(new StringReader(csvSchema)).iterator().next();
                return CsvRecordToIndexedRecordConverter.inferSchema(r);
            } catch (Exception e) {
                throw LocalIOErrorCode.createCannotParseSchema(e, csvSchema);
            }
        }
        // Fall back to a schema based on the number of columns.
        try {
            int maxSize = 0;
            for (CSVRecord r : CSVFormat.RFC4180 //
                    .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                    .withRecordSeparator(properties.getRecordDelimiter())
                    .parse(new StringReader(properties.values.getValue())))
                maxSize = Math.max(maxSize, r.size());
            if (maxSize == 0)
                throw LocalIOErrorCode.requireAtLeastOneRecord(new RuntimeException());
            return CsvRecordToIndexedRecordConverter.inferSchema(maxSize);
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.values.getValue());
        }
    case JSON:
        if (properties.values.getValue().trim().isEmpty())
            throw LocalIOErrorCode.requireAtLeastOneRecord(new RuntimeException());
        return getValues(1).get(0).getSchema();
    case AVRO:
        try {
            return new Schema.Parser().parse(properties.schema.getValue());
        } catch (Exception e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.schema.getValue());
        }
    }
    throw LocalIOErrorCode.createCannotParseSchema(null, properties.schema.getValue());
}
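The withDelimiter and withRecordSeparator calls above return modified copies of RFC4180, which is how this runtime honours user-chosen delimiters before reading the first record as a schema. A minimal sketch of the same pattern, using a hypothetical semicolon-delimited header string instead of the Talend properties:

import java.io.StringReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class CustomDelimiterSketch {
    public static void main(String[] args) throws Exception {
        String csvSchema = "id;name;age"; // hypothetical user-supplied header line
        // CSVFormat is immutable: each with* call returns a new format derived from RFC4180.
        CSVFormat format = CSVFormat.RFC4180
                .withDelimiter(';')
                .withRecordSeparator("\n");
        // Take the first record, as the getSchema() example above does.
        CSVRecord first = format.parse(new StringReader(csvSchema)).iterator().next();
        System.out.println("columns: " + first.size()); // columns: 3
    }
}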
From source file:org.talend.components.localio.runtime.fixed.FixedDatasetRuntime.java
public List<IndexedRecord> getValues(int limit) {
    List<IndexedRecord> values = new ArrayList<>();
    switch (properties.format.getValue()) {
    case CSV:
        try {
            CsvRecordToIndexedRecordConverter converter = new CsvRecordToIndexedRecordConverter(getSchema());
            for (CSVRecord r : CSVFormat.RFC4180 //
                    .withDelimiter(properties.getFieldDelimiter().charAt(0)) //
                    .withRecordSeparator(properties.getRecordDelimiter())
                    .parse(new StringReader(properties.values.getValue())))
                values.add(converter.convertToAvro(r));
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseSchema(e, properties.values.getValue());
        }
        break;
    case JSON:
        ObjectMapper mapper = new ObjectMapper();
        JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(mapper);
        JsonGenericRecordConverter converter = null;
        JsonFactory jsonFactory = new JsonFactory();
        try (StringReader r = new StringReader(properties.values.getValue())) {
            Iterator<JsonNode> value = mapper.readValues(jsonFactory.createParser(r), JsonNode.class);
            int count = 0;
            while (value.hasNext() && count++ < limit) {
                String json = value.next().toString();
                if (converter == null) {
                    Schema jsonSchema = jsonSchemaInferrer.inferSchema(json);
                    converter = new JsonGenericRecordConverter(jsonSchema);
                }
                values.add(converter.convertToAvro(json));
            }
        } catch (IOException e) {
            throw LocalIOErrorCode.createCannotParseJson(e, properties.schema.getValue(),
                    properties.values.getValue());
        }
        break;
    case AVRO:
        Schema schema = getSchema();
        if (isRandom()) {
            GeneratorFunction<IndexedRecord> gf = (GeneratorFunction<IndexedRecord>) GeneratorFunctions
                    .of(getSchema());
            GeneratorFunction.GeneratorContext ctx = GeneratorFunction.GeneratorContext.of(0, 0L);
            for (int i = 0; i < limit; i++) {
                ctx.setRowId(i);
                values.add(gf.apply(ctx));
            }
        } else {
            try (ByteArrayInputStream bais = new ByteArrayInputStream(
                    properties.values.getValue().trim().getBytes())) {
                JsonDecoder decoder = DecoderFactory.get().jsonDecoder(schema, bais);
                DatumReader<IndexedRecord> reader = new GenericDatumReader<>(schema);
                int count = 0;
                while (count++ < limit) {
                    values.add(reader.read(null, decoder));
                }
            } catch (EOFException e) {
                // Indicates the end of the values.
            } catch (IOException e) {
                throw LocalIOErrorCode.createCannotParseAvroJson(e, properties.schema.getValue(),
                        properties.values.getValue());
            }
        }
        break;
    }
    return values;
}
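Stripped of the Talend converter classes, the CSV branch above is just an iteration over the parsed CSVRecord stream. This illustrative sketch (invented data, plain lists instead of Avro records) shows the underlying commons-csv loop on its own:

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class CollectRowsSketch {
    public static void main(String[] args) throws Exception {
        String values = "1,red\r\n2,green\r\n3,blue\r\n";
        List<List<String>> rows = new ArrayList<>();
        // Each CSVRecord becomes one row; a real converter could build Avro records here instead.
        for (CSVRecord record : CSVFormat.RFC4180.parse(new StringReader(values))) {
            List<String> row = new ArrayList<>();
            record.forEach(row::add);
            rows.add(row);
        }
        System.out.println(rows); // [[1, red], [2, green], [3, blue]]
    }
}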
From source file:org.thegalactic.context.io.ContextSerializerCsv.java
/**
 * Read a context from a csv file.
 *
 * The following format is respected:
 *
 * The first line contains the attribute names, the other lines contain the
 * observation identifiers followed by boolean values
 *
 * ~~~
 * "",a,b,c,d,e
 * 1,1,0,1,0,0
 * 2,1,1,0,0,0
 * 3,0,1,0,1,1
 * 4,0,0,1,0,1
 * ~~~
 *
 * If the first attribute is the empty string, the first column corresponds
 * to the individual identifiers. In the other case, the individual
 * identifiers will be generated by successive integers.
 *
 * ~~~
 * a,b,c,d,e
 * 1,0,1,0,0
 * 1,1,0,0,0
 * 0,1,0,1,1
 * 0,0,1,0,1
 * ~~~
 *
 * @param context a context to read
 * @param file    a file
 *
 * @throws IOException When an IOException occurs
 */
public void read(Context context, BufferedReader file) throws IOException {
    // Parse the file
    CSVParser parser = CSVFormat.RFC4180.parse(file);

    // Get the records and record size
    List<CSVRecord> records = parser.getRecords();
    int length = records.size();

    // Verify length
    if (length == 0) {
        throw new IOException("CSV cannot be empty");
    }

    // Get the attributes and the attribute size
    CSVRecord attributes = records.get(0);
    int size = attributes.size();

    // Detect invalid attribute size
    if (size == 1 && attributes.get(0).equals("")) {
        throw new IOException("Attribute size cannot be 0");
    }

    // Index of the first attribute
    int first = 0;
    if (attributes.get(0).equals("")) {
        first = 1;
    }

    // Get the attributes
    for (int i = first; i < size; i++) {
        String attribute = attributes.get(i);

        // Detect duplicated attribute
        if (!context.addToAttributes(attribute)) {
            throw new IOException("Duplicated attribute");
        }

        // Detect empty attribute
        if ("".equals(attribute)) {
            throw new IOException("Empty attribute");
        }
    }

    // Get the data
    for (int j = 1; j < length; j++) {
        // Get the current record
        CSVRecord record = records.get(j);

        // Detect incorrect size
        if (record.size() != size) {
            throw new IOException("Line does not have the correct number of attributes");
        }

        // Get the observation identifier
        String identifier;
        if (first == 1) {
            identifier = record.get(0);
        } else {
            identifier = String.valueOf(j);
        }

        // Detect duplicated identifier
        if (!context.addToObservations(identifier)) {
            throw new IOException("Duplicated identifier");
        }

        // Add the extent/intent for the current identifier and current attribute
        for (int i = first; i < size; i++) {
            if (record.get(i).equals("1")) {
                context.addExtentIntent(identifier, attributes.get(i));
            }
        }
    }

    // Close the parser
    parser.close();
    context.setBitSets();
}
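The read() method above loads the whole input with parser.getRecords() and then treats the first record as the header row. A minimal sketch of that load-everything pattern on a made-up two-observation context (class name and data are illustrative only):

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.List;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class GetRecordsSketch {
    public static void main(String[] args) throws Exception {
        String csv = "\"\",a,b\r\n1,1,0\r\n2,0,1\r\n";
        try (CSVParser parser = CSVFormat.RFC4180.parse(new BufferedReader(new StringReader(csv)))) {
            // getRecords() reads the remaining input eagerly into memory.
            List<CSVRecord> records = parser.getRecords();
            CSVRecord header = records.get(0); // "", a, b
            System.out.println("attributes: " + (header.size() - 1));
            for (int j = 1; j < records.size(); j++) {
                CSVRecord row = records.get(j);
                System.out.println("observation " + row.get(0) + " has " + (row.size() - 1) + " values");
            }
        }
    }
}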
From source file:org.thegalactic.context.io.ContextSerializerCsv.java
/**
 * Write a context to a csv file.
 *
 * The following format is respected:
 *
 * The first line contains the attribute names, the other lines contain the
 * observation identifiers followed by boolean values
 *
 * ~~~
 * "",a,b,c,d,e
 * 1,1,0,1,0,0
 * 2,1,1,0,0,0
 * 3,0,1,0,1,1
 * 4,0,0,1,0,1
 * ~~~
 *
 * @param context a context to write
 * @param file    a file
 *
 * @throws IOException When an IOException occurs
 */
public void write(Context context, BufferedWriter file) throws IOException {
    CSVPrinter printer = new CSVPrinter(file, CSVFormat.RFC4180);

    // Get the observations and the attributes
    TreeSet<Comparable> observations = context.getObservations();
    TreeSet<Comparable> attributes = context.getAttributes();

    // Prepare the attribute line
    printer.print("");
    for (Comparable attribute : attributes) {
        // Write each attribute
        printer.print(attribute);
    }
    printer.println();

    for (Comparable observation : observations) {
        // Write the observation
        printer.print(observation);

        // Write the extent/intents
        for (Comparable attribute : attributes) {
            if (context.getIntent(observation).contains(attribute)) {
                printer.print(1);
            } else {
                printer.print(0);
            }
        }
        printer.println();
    }
    printer.close();
}
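For writing, the class pairs RFC4180 with CSVPrinter: print() emits one field (quoting it when needed) and println() ends the record with the format's CRLF separator. A small standalone sketch of the same API writing invented data to a StringWriter:

import java.io.StringWriter;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

public class PrinterSketch {
    public static void main(String[] args) throws Exception {
        StringWriter out = new StringWriter();
        try (CSVPrinter printer = new CSVPrinter(out, CSVFormat.RFC4180)) {
            // Header row: an empty first cell followed by attribute names.
            printer.print("");
            printer.print("a");
            printer.print("b");
            printer.println();
            // One observation row; printRecord() is a shortcut for print() calls plus println().
            printer.printRecord("1", 1, 0);
        }
        System.out.print(out); // header row then data row, each terminated by CRLF
    }
}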
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
/**
 * Generate a random sample of the dataset using Spark.
 */
public static SamplePoints getSampleFromDAS(String path, int sampleSize, String sourceType, int tenantId)
        throws MLMalformedDatasetException {
    JavaSparkContext sparkContext = null;
    try {
        Map<String, Integer> headerMap = null;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();

        // java spark context
        sparkContext = MLCoreServiceValueHolder.getInstance().getSparkContext();
        JavaRDD<String> lines;
        String headerLine = extractHeaderLine(path, tenantId);
        headerMap = generateHeaderMap(headerLine, CSVFormat.RFC4180);

        // DAS case path = table name
        lines = getLinesFromDASTable(path, tenantId, sparkContext);
        return getSamplePoints(sampleSize, true, headerMap, columnData, CSVFormat.RFC4180, lines);
    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + path + ". Cause: " + e, e);
    }
}
From source file:org.wso2.carbon.ml.core.utils.MLUtils.java
public static JavaRDD<String> getLinesFromDASTable(String tableName, int tenantId, JavaSparkContext sparkContext)
        throws AnalyticsTableNotAvailableException, AnalyticsException {
    JavaRDD<String> lines;
    String tableSchema = extractTableSchema(tableName, tenantId);
    SQLContext sqlCtx = new SQLContext(sparkContext);
    sqlCtx.sql("CREATE TEMPORARY TABLE ML_REF USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider "
            + "OPTIONS (" + "tenantId \"" + tenantId + "\", " + "tableName \"" + tableName + "\", "
            + "schema \"" + tableSchema + "\"" + ")");
    DataFrame dataFrame = sqlCtx.sql("select * from ML_REF");
    // Additional auto-generated column "_timestamp" needs to be dropped because it is not in the schema.
    JavaRDD<Row> rows = dataFrame.drop("_timestamp").javaRDD();
    lines = rows.map(new RowsToLines.Builder().separator(CSVFormat.RFC4180.getDelimiter() + "").build());
    return lines;
}
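Here RFC4180 is used only as a source of the delimiter: getDelimiter() returns the ',' that the generated Spark lines must use. Joining raw values with the delimiter does not apply RFC 4180 quoting; when quoting matters, CSVFormat.format() builds a single correctly escaped line. A short illustrative sketch (values invented):

import org.apache.commons.csv.CSVFormat;

public class DelimiterSketch {
    public static void main(String[] args) {
        // getDelimiter() exposes the format's separator character (',' for RFC4180).
        String naive = String.join(CSVFormat.RFC4180.getDelimiter() + "", "1", "Doe, John");
        // format() quotes fields that contain the delimiter; naive joining does not.
        String quoted = CSVFormat.RFC4180.format("1", "Doe, John");
        System.out.println(naive);  // 1,Doe, John   (ambiguous)
        System.out.println(quoted); // 1,"Doe, John"
    }
}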
From source file:org.wso2.carbon.ml.dataset.internal.DatasetSummary.java
/**
 * Constructor to create the parser for the data-set and initialize the lists.
 *
 * @param csvDataFile File object of the data-set CSV file.
 * @param datasetID   Unique Identifier of the data-set.
 * @throws DatasetSummaryException
 */
protected DatasetSummary(File csvDataFile, String datasetID) throws DatasetSummaryException {
    this.datasetID = datasetID;
    try {
        Reader reader = new InputStreamReader(new FileInputStream(csvDataFile.getAbsolutePath()),
                DatasetConfigurations.UTF_8);
        this.parser = new CSVParser(reader, CSVFormat.RFC4180.withHeader().withAllowMissingColumnNames(true));
        this.headerMap = this.parser.getHeaderMap();
        int noOfFeatures = this.headerMap.size();

        // Initialize the lists.
        this.missing = new int[noOfFeatures];
        this.unique = new int[noOfFeatures];
        this.type = new String[noOfFeatures];
        this.histogram = new EmpiricalDistribution[noOfFeatures];
        for (int i = 0; i < noOfFeatures; i++) {
            this.descriptiveStats.add(new DescriptiveStatistics());
            this.graphFrequencies.add(new TreeMap<String, Integer>());
            this.columnData.add(new ArrayList<String>());
        }
    } catch (IOException e) {
        throw new DatasetSummaryException(
                "Error occured while reading from the dataset " + datasetID + ": " + e.getMessage(), e);
    }
}
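withHeader() with no arguments tells the parser to read the first record as the header, and withAllowMissingColumnNames(true) tolerates blank header cells; getHeaderMap() then maps each column name to its index. A minimal sketch of that combination on an in-memory string (names and data invented):

import java.io.StringReader;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class HeaderMapSketch {
    public static void main(String[] args) throws Exception {
        String csv = "age,,salary\r\n34,x,1200\r\n";
        CSVFormat format = CSVFormat.RFC4180.withHeader().withAllowMissingColumnNames(true);
        try (CSVParser parser = new CSVParser(new StringReader(csv), format)) {
            // The blank header cell is tolerated; named columns map to their positions.
            Map<String, Integer> headerMap = parser.getHeaderMap();
            System.out.println(headerMap);
            for (CSVRecord record : parser) {
                // Fields can be read by column name once a header is configured.
                System.out.println(record.get("age") + " / " + record.get("salary"));
            }
        }
    }
}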
From source file:org.wso2.carbon.notebook.api.paragraph.PreprocessorEndpoint.java
/**
 * Process the selected dataset
 *
 * @param request              Http servlet request
 * @param preprocessParameters JSON object string with parameters for pre-processing
 * @return response
 */
@POST
@Path("/preprocess")
public Response preprocess(@Context HttpServletRequest request, String preprocessParameters) {
    HttpSession session = request.getSession();
    int tenantID = (Integer) session.getAttribute("tenantID");
    PreprocessorRequest preprocessRequest = new Gson().fromJson(preprocessParameters, PreprocessorRequest.class);
    String tableName = preprocessRequest.getTableName();
    String preprocessedTableName = preprocessRequest.getPreprocessedTableName();
    List<Feature> featureList = preprocessRequest.getFeatureList();
    List<Feature> orderedFeatureList = new ArrayList<>();
    String headerLine;
    String jsonString;
    JavaRDD<String[]> preprocessedLines;
    GeneralResponse response;

    // Order the features according to the schema
    for (int i = 0; i < featureList.size(); i++) {
        orderedFeatureList.add(new Feature());
    }
    try {
        headerLine = MLUtils.extractHeaderLine(tableName, tenantID);
        for (Feature feature : featureList) {
            int index = MLUtils.getFeatureIndex(feature.getName(), headerLine,
                    String.valueOf(CSVFormat.RFC4180.getDelimiter()));
            feature.setIndex(index);
            orderedFeatureList.set(index, feature);
        }
        preprocessedLines = PreprocessorUtils.preProcess(tenantID, tableName, orderedFeatureList, headerLine);
        PreprocessorUtils.saveTable(tenantID, tableName, preprocessedTableName, orderedFeatureList,
                preprocessedLines);
        response = new GeneralResponse(Status.SUCCESS);
    } catch (AnalyticsException | PreprocessorException e) {
        response = new ErrorResponse(e.getMessage());
    } catch (RuntimeException e) {
        response = new ErrorResponse("Internal Server Error");
    }
    jsonString = new Gson().toJson(response);
    return Response.ok(jsonString, MediaType.APPLICATION_JSON).build();
}
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Generate a random sample of the data set using Spark.
 *
 * @param tableName  Name of the table
 * @param sampleSize Sample size
 * @param tenantId   Tenant ID
 * @return Sample points
 */
public static SamplePoints getSampleFromDAS(String tableName, int sampleSize, int tenantId)
        throws MLMalformedDatasetException {
    JavaSparkContext sparkContext;
    try {
        Map<String, Integer> headerMap;
        // List containing actual data of the sample.
        List<List<String>> columnData = new ArrayList<List<String>>();

        // java spark context
        sparkContext = ServiceHolder.getSparkContextService().getJavaSparkContext();
        JavaRDD<String> lines;
        String headerLine = extractHeaderLine(tableName, tenantId);
        headerMap = generateHeaderMap(headerLine, CSVFormat.RFC4180);

        // DAS case path = table name
        lines = getLinesFromDASTable(tableName, tenantId, sparkContext);
        return getSamplePoints(sampleSize, true, headerMap, columnData, CSVFormat.RFC4180, lines);
    } catch (Exception e) {
        throw new MLMalformedDatasetException(
                "Failed to extract the sample points from path: " + tableName + ". Cause: " + e, e);
    }
}
From source file:org.wso2.carbon.notebook.core.util.MLUtils.java
/**
 * Get the rows as lines of a table in the DAS
 *
 * @param tableName    Name of the table
 * @param tenantId     Tenant ID
 * @param sparkContext Java spark context
 * @return Table rows as lines
 */
public static JavaRDD<String> getLinesFromDASTable(String tableName, int tenantId, JavaSparkContext sparkContext)
        throws AnalyticsException {
    JavaRDD<String> lines;
    String tableSchema = extractTableSchema(tableName, tenantId);
    SQLContext sqlCtx = new SQLContext(sparkContext);
    sqlCtx.sql("CREATE TEMPORARY TABLE ML_REF USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider "
            + "OPTIONS (" + "tenantId \"" + tenantId + "\", " + "tableName \"" + tableName + "\", "
            + "schema \"" + tableSchema + "\"" + ")");
    DataFrame dataFrame = sqlCtx.sql("select * from ML_REF");
    // Additional auto-generated column "_timestamp" needs to be dropped because it is not in the schema.
    JavaRDD<Row> rows = dataFrame.drop("_timestamp").javaRDD();
    lines = rows.map(new RowsToLines.Builder().separator(CSVFormat.RFC4180.getDelimiter() + "").build());
    return lines;
}