Java tutorial: asynchronous CSV-to-JSON serialization with Talend Data Prep's CSVSerializer
// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// https://github.com/Talend/data-prep/blob/master/LICENSE
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================

package org.talend.dataprep.schema.csv;

import java.io.*;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.task.TaskExecutor;
import org.springframework.stereotype.Service;
import org.talend.dataprep.api.dataset.ColumnMetadata;
import org.talend.dataprep.api.dataset.DataSetMetadata;
import org.talend.dataprep.exception.TDPException;
import org.talend.dataprep.exception.error.CommonErrorCodes;
import org.talend.dataprep.schema.Serializer;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;

import au.com.bytecode.opencsv.CSVReader;

@Service("serializer#csv")
public class CSVSerializer implements Serializer {

    private static final Logger LOGGER = LoggerFactory.getLogger(CSVSerializer.class);

    @Resource(name = "serializer#csv#executor")
    TaskExecutor executor;

    @Override
    public InputStream serialize(InputStream rawContent, DataSetMetadata metadata, long limit) {
        try {
            PipedInputStream pipe = new PipedInputStream();
            PipedOutputStream jsonOutput = new PipedOutputStream(pipe);
            // Serialize asynchronously for better performance (especially if the caller doesn't consume everything,
            // see sampling).
            Runnable r = () -> {
                final Map<String, String> parameters = metadata.getContent().getParameters();
                final String separator = parameters.get(CSVFormatFamily.SEPARATOR_PARAMETER);
                try (CSVReader reader = new CSVReader(new InputStreamReader(rawContent, metadata.getEncoding()),
                        separator.charAt(0), '"', '\0')) {
                    JsonGenerator generator = new JsonFactory().createGenerator(jsonOutput);
                    int i = 0;
                    while (i++ < metadata.getContent().getNbLinesInHeader()) {
                        reader.readNext(); // Skip all header lines
                    }
                    generator.writeStartArray();
                    writeLineContent(reader, metadata, generator, separator, limit);
                    generator.writeEndArray();
                    generator.flush();
                } catch (Exception e) {
                    // The consumer may very well interrupt consumption of the stream (in case of limit(n) use for
                    // sampling). This is not an issue, as the consumer is allowed to partially consume results; it's
                    // up to the consumer to ensure the data it consumed is consistent.
                    LOGGER.debug("Unable to continue serialization for {}. Skipping remaining content.", metadata.getId(), e);
                } finally {
                    try {
                        jsonOutput.close();
                    } catch (IOException e) {
                        LOGGER.error("Unable to close output", e);
                    }
                }
            };
            executor.execute(r);
            return pipe;
        } catch (IOException e) {
            throw new TDPException(CommonErrorCodes.UNABLE_TO_SERIALIZE_TO_JSON, e);
        }
    }

    /**
     * Write the line content.
     *
     * @param reader the csv reader to use as data source.
     * @param metadata the dataset metadata to use to get the columns.
     * @param generator the json generator used to actually write the line content.
     * @param separator the csv separator to use.
     * @param limit the maximum number of lines in the exported content.
     * @throws IOException if an error occurs.
     */
    private void writeLineContent(CSVReader reader, DataSetMetadata metadata, JsonGenerator generator,
            String separator, long limit) throws IOException {
        String[] line;
        int current = 0;

        while ((line = reader.readNext()) != null && withinLimit(limit, current)) {
            // Skip empty lines
            if (line.length == 1 && (StringUtils.isEmpty(line[0]) || line[0].charAt(0) == '\u0000')) {
                continue;
            }

            List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
            generator.writeStartObject();
            int columnsSize = columns.size();
            for (int i = 0; i < columnsSize; i++) {
                ColumnMetadata columnMetadata = columns.get(i);
                generator.writeFieldName(columnMetadata.getId());
                // Deal with additional content (line.length > columns.size()): fold the extra fields into the last column
                if (i == columnsSize - 1 && line.length > columnsSize) {
                    String additionalContent = getRemainingColumns(line, i, separator);
                    generator.writeString(cleanCharacters(additionalContent));
                }
                // Normal case, including rows with fewer fields than columns (line.length < columns.size())
                else if (i < line.length && line[i] != null) {
                    generator.writeString(cleanCharacters(line[i]));
                }
                // Deal with missing or null values
                else {
                    generator.writeNull();
                }
            }
            generator.writeEndObject();
            current++;
        }
    }

    private boolean withinLimit(long limit, int current) {
        return limit < 0 || current < limit;
    }

    private String cleanCharacters(final String value) {
        return StringUtils.remove(value, '\u0000');
    }

    /**
     * Return the remaining raw (with separators) content of the line.
     *
     * @param line the line to parse.
     * @param start where to start in the line.
     * @param separator the separator to append.
     * @return the remaining raw (with separators) content of the line.
     */
    private String getRemainingColumns(String[] line, int start, String separator) {
        StringBuilder buffer = new StringBuilder();
        for (int j = start; j < line.length; j++) {
            buffer.append(line[j]);
            if (j < line.length - 1) {
                buffer.append(separator);
            }
        }
        return buffer.toString();
    }
}
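
The essential pattern in serialize is the PipedInputStream/PipedOutputStream pair: the method returns the read end of the pipe immediately, while a task submitted to the injected TaskExecutor writes JSON into the write end on another thread, so a consumer can start reading before serialization finishes. What follows is a minimal, self-contained sketch of that pattern using only the JDK and Jackson; the class name PipedJsonDemo, the single-thread executor, and the sample rows are illustrative assumptions, not part of the Talend code.

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;

public class PipedJsonDemo {

    public static void main(String[] args) throws IOException {
        ExecutorService executor = Executors.newSingleThreadExecutor();

        // The producer writes JSON into the piped output stream while the
        // caller reads from the connected input stream on another thread.
        PipedInputStream pipe = new PipedInputStream();
        PipedOutputStream jsonOutput = new PipedOutputStream(pipe);

        executor.execute(() -> {
            try (JsonGenerator generator = new JsonFactory().createGenerator(jsonOutput)) {
                generator.writeStartArray();
                for (int i = 0; i < 3; i++) { // illustrative sample rows
                    generator.writeStartObject();
                    generator.writeNumberField("row", i);
                    generator.writeEndObject();
                }
                generator.writeEndArray();
            } catch (IOException e) {
                // The reader may close the pipe early; treat it as the end of consumption.
            }
        });

        // Consumer side: read the JSON as it is produced.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(pipe, StandardCharsets.UTF_8))) {
            String lineRead;
            while ((lineRead = reader.readLine()) != null) {
                System.out.println(lineRead);
            }
        }
        executor.shutdown();
    }
}

As the sampling comment in serialize anticipates, a consumer that closes the pipe early makes the producer's next write fail with an IOException; both the sketch and the serializer treat that as a normal end of consumption rather than an error.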
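
writeLineContent is deliberately tolerant of ragged rows: extra fields are re-joined with the original separator into the last column via getRemainingColumns, and missing fields become JSON null. The short sketch below, with a hypothetical RaggedRowDemo class and a hard-coded ';' separator, shows how the opencsv reader configuration used above (quote '"', escape '\0') tokenizes such rows.

import java.io.StringReader;
import java.util.Arrays;

import au.com.bytecode.opencsv.CSVReader;

public class RaggedRowDemo {

    public static void main(String[] args) throws Exception {
        // Same reader configuration as CSVSerializer: a single-character separator,
        // '"' as quote character, and '\0' as escape character, which effectively
        // disables backslash escaping.
        String csv = "a;b;\"c;d\"\n1;2;3;4\n";
        try (CSVReader reader = new CSVReader(new StringReader(csv), ';', '"', '\0')) {
            String[] row;
            while ((row = reader.readNext()) != null) {
                System.out.println(row.length + " fields: " + Arrays.toString(row));
            }
        }
    }
}

The first row parses to three fields because the quoted "c;d" keeps its separator inside a single field; the second parses to four, which the serializer would fold into the last column before writing the JSON object.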