// Apache Tika: TextAndCSVParser — plain-text / CSV / TSV parser
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.csv; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVRecord; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.config.Field; import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * Unless the {@link TikaCoreProperties#CONTENT_TYPE_OVERRIDE} is set, * this 
parser tries to assess whether the file is a text file, csv or tsv. * If the detector detects regularity in column numbers and/or encapsulated cells, * this parser will apply the {@link org.apache.commons.csv.CSVParser}; * otherwise, it will treat the contents as text. * <p> * If there is a csv parse exception during detection, the parser sets * the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN} * and treats the file as {@link MediaType#TEXT_PLAIN}. * </p> * <p> * If there is a csv parse exception during the parse, the parser * writes what's left of the stream as if it were text and then throws * an exception. As of this writing, the content that was buffered by the underlying * {@link org.apache.commons.csv.CSVParser} is lost. * </p> */ public class TextAndCSVParser extends AbstractEncodingDetectorParser { private static final String CSV_PREFIX = "csv"; private static final String CHARSET = "charset"; private static final String DELIMITER = "delimiter"; public static final Property DELIMITER_PROPERTY = Property .externalText(CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + DELIMITER); private static final String TD = "td"; private static final String TR = "tr"; private static final String TABLE = "table"; static final MediaType CSV = MediaType.text("csv"); static final MediaType TSV = MediaType.text("tsv"); private static final int DEFAULT_MARK_LIMIT = 20000; private static final char[] DEFAULT_DELIMITERS = new char[] { ',', '\t' }; private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>(); private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>(); static { CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma"); CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab"); CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe"); CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon"); CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon"); } static { for (Map.Entry<Character, String> e : 
CHAR_TO_STRING_DELIMITER_MAP.entrySet()) { STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey()); } } private static final Set<MediaType> SUPPORTED_TYPES = Collections .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(CSV, TSV, MediaType.TEXT_PLAIN))); private char[] delimiters = DEFAULT_DELIMITERS; /** * This is the mark limit in characters (not bytes) to * read from the stream when classifying the stream as * csv, tsv or txt. */ @Field private int markLimit = DEFAULT_MARK_LIMIT; public TextAndCSVParser() { super(); } public TextAndCSVParser(EncodingDetector encodingDetector) { super(encodingDetector); } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { CSVParams params = getOverride(metadata); Reader reader = null; Charset charset = null; if (!params.isComplete()) { reader = detect(params, stream, metadata, context); if (params.getCharset() != null) { charset = params.getCharset(); } else { charset = ((AutoDetectReader) reader).getCharset(); } } else { reader = new BufferedReader(new InputStreamReader(stream, params.getCharset())); charset = params.getCharset(); } updateMetadata(params, metadata); //if text or a non-csv/tsv category of text //treat this as text and be done //TODO -- if it was detected as a non-csv subtype of text if (!params.getMediaType().getBaseType().equals(CSV) && !params.getMediaType().getBaseType().equals(TSV)) { handleText(reader, charset, handler, metadata); return; } CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter()); metadata.set(DELIMITER_PROPERTY, CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter())); XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata); try (org.apache.commons.csv.CSVParser commonsParser = new org.apache.commons.csv.CSVParser(reader, 
csvFormat)) { xhtmlContentHandler.startDocument(); xhtmlContentHandler.startElement(TABLE); try { for (CSVRecord row : commonsParser) { xhtmlContentHandler.startElement(TR); for (String cell : row) { xhtmlContentHandler.startElement(TD); xhtmlContentHandler.characters(cell); xhtmlContentHandler.endElement(TD); } xhtmlContentHandler.endElement(TR); } } catch (IllegalStateException e) { //if there's a parse exception //try to get the rest of the content...treat it as text for now //There will be some content lost because of buffering. //TODO -- figure out how to improve this xhtmlContentHandler.endElement(TABLE); xhtmlContentHandler.startElement("div", "name", "after exception"); handleText(reader, xhtmlContentHandler); xhtmlContentHandler.endElement("div"); xhtmlContentHandler.endDocument(); //TODO -- consider dumping what's left in the reader as text throw new TikaException("exception parsing the csv", e); } xhtmlContentHandler.endElement(TABLE); xhtmlContentHandler.endDocument(); } } private void handleText(Reader reader, Charset charset, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException { // Automatically detect the character encoding //try to get detected content type; could be a subclass of text/plain //such as vcal, etc. 
String incomingMime = metadata.get(Metadata.CONTENT_TYPE); MediaType mediaType = MediaType.TEXT_PLAIN; if (incomingMime != null) { MediaType tmpMediaType = MediaType.parse(incomingMime); if (tmpMediaType != null) { mediaType = tmpMediaType; } } MediaType type = new MediaType(mediaType, charset); metadata.set(Metadata.CONTENT_TYPE, type.toString()); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); handleText(reader, xhtml); xhtml.endDocument(); } private static void handleText(Reader reader, XHTMLContentHandler xhtml) throws SAXException, IOException { xhtml.startElement("p"); char[] buffer = new char[4096]; int n = reader.read(buffer); while (n != -1) { xhtml.characters(buffer, 0, n); n = reader.read(buffer); } xhtml.endElement("p"); } private Reader detect(CSVParams params, InputStream stream, Metadata metadata, ParseContext context) throws IOException, TikaException { //if the file was already identified as not .txt, .csv or .tsv //don't even try to csv or not String mediaString = metadata.get(Metadata.CONTENT_TYPE); if (mediaString != null) { MediaType mediaType = MediaType.parse(mediaString); if (!SUPPORTED_TYPES.contains(mediaType.getBaseType())) { params.setMediaType(mediaType); return new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context)); } } Reader reader = null; if (params.getCharset() == null) { reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context)); params.setCharset(((AutoDetectReader) reader).getCharset()); if (params.isComplete()) { return reader; } } else { reader = new BufferedReader( new InputStreamReader(new CloseShieldInputStream(stream), params.getCharset())); } if (params.getDelimiter() == null && (params.getMediaType() == null || isCSVOrTSV(params.getMediaType()))) { CSVSniffer sniffer = new CSVSniffer(markLimit, 
delimiters); CSVResult result = sniffer.getBest(reader, metadata); params.setMediaType(result.getMediaType()); params.setDelimiter(result.getDelimiter()); } return reader; } private CSVParams getOverride(Metadata metadata) { String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE); if (override == null) { return new CSVParams(); } MediaType mediaType = MediaType.parse(override); if (mediaType == null) { return new CSVParams(); } String charsetString = mediaType.getParameters().get(CHARSET); Charset charset = null; if (charsetString != null) { try { charset = Charset.forName(charsetString); } catch (UnsupportedCharsetException e) { } } if (!isCSVOrTSV(mediaType)) { return new CSVParams(mediaType, charset); } String delimiterString = mediaType.getParameters().get(DELIMITER); if (delimiterString == null) { return new CSVParams(mediaType, charset); } if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) { return new CSVParams(mediaType, charset, (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString)); } if (delimiterString.length() == 1) { return new CSVParams(mediaType, charset, delimiterString.charAt(0)); } //TODO: log bad/unrecognized delimiter string return new CSVParams(mediaType, charset); } static boolean isCSVOrTSV(MediaType mediaType) { if (mediaType == null) { return false; } if (mediaType.getBaseType().equals(TSV) || mediaType.getBaseType().equals(CSV)) { return true; } return false; } private void updateMetadata(CSVParams params, Metadata metadata) { MediaType mediaType = null; if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) { mediaType = MediaType.TEXT_PLAIN; } else if (params.getDelimiter() != null) { if (params.getDelimiter() == '\t') { mediaType = TSV; } else { mediaType = CSV; } } else { if (metadata.get(Metadata.CONTENT_TYPE) != null) { mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); } } Map<String, String> attrs = new HashMap<>(); if (params.getCharset() != null) { attrs.put(CHARSET, 
params.getCharset().name()); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name()); } if (!mediaType.equals(MediaType.TEXT_PLAIN) && params.getDelimiter() != null) { if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) { attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter())); } else { attrs.put(DELIMITER, Integer.toString((int) params.getDelimiter().charValue())); } } MediaType type = new MediaType(mediaType, attrs); metadata.set(Metadata.CONTENT_TYPE, type.toString()); } }