/*
 * Copyright 2012-2015 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.robot.extractor.impl;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;

import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.robot.Constants;
import org.codelibs.robot.container.RobotContainer;
import org.codelibs.robot.entity.ExtractData;
import org.codelibs.robot.exception.ExtractException;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.extractor.Extractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * @author shinsuke
 *
 */
public class TikaExtractor implements Extractor {
    private static final Logger logger = LoggerFactory // NOPMD
            .getLogger(TikaExtractor.class);

    @Resource
    protected RobotContainer robotContainer;

    public String outputEncoding = Constants.UTF_8;

    public boolean readAsTextIfFailed = true;

    public long maxCompressionRatio = 100;

    public long maxUncompressionSize = 1000000;

    public int initialBufferSize = 10000;

    public TikaConfig tikaConfig;

    protected Map<String, String> pdfPasswordMap = new HashMap<String, String>();

    @PostConstruct
    public void init() {
        if (tikaConfig == null) {
            tikaConfig = TikaConfig.getDefaultConfig();
        }
        if (logger.isDebugEnabled()) {
            final Parser parser = tikaConfig.getParser();
            logger.debug("supportedTypes: {}",
                    parser.getSupportedTypes(new ParseContext()));
        }
    }

    @Override
    public ExtractData getText(final InputStream inputStream,
            final Map<String, String> params) {
        if (inputStream == null) {
            throw new RobotSystemException("The inputstream is null.");
        }

        File tempFile = null;
        try {
            tempFile = File.createTempFile("tikaExtractor-", ".out");
        } catch (final IOException e) {
            throw new ExtractException("Could not create a temp file.", e);
        }

        try {
            try (OutputStream out = new FileOutputStream(tempFile)) {
                CopyUtil.copy(inputStream, out);
            }

            InputStream in = new FileInputStream(tempFile);

            // Capture System.out/err while Tika runs so that parser output
            // does not pollute the crawler's console; it is logged afterwards.
            final PrintStream originalOutStream = System.out;
            final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            System.setOut(new PrintStream(outStream, true));
            final PrintStream originalErrStream = System.err;
            final ByteArrayOutputStream errStream = new ByteArrayOutputStream();
            System.setErr(new PrintStream(errStream, true));
            try {
                final String resourceName = params == null ? null : params
                        .get(TikaMetadataKeys.RESOURCE_NAME_KEY);
                final String contentType = params == null ? null : params
                        .get(HttpHeaders.CONTENT_TYPE);
                String contentEncoding = params == null ? null : params
                        .get(HttpHeaders.CONTENT_ENCODING);

                // password for pdf
                String pdfPassword = params == null ? null : params
                        .get(ExtractData.PDF_PASSWORD);
                if (pdfPassword == null && params != null) {
                    pdfPassword = getPdfPassword(params.get(ExtractData.URL),
                            resourceName);
                }

                final Metadata metadata = createMetadata(resourceName,
                        contentType, contentEncoding, pdfPassword);

                final Parser parser = new DetectParser();
                final ParseContext parseContext = new ParseContext();
                parseContext.set(Parser.class, parser);

                final StringWriter writer = new StringWriter(initialBufferSize);
                parser.parse(in, new BodyContentHandler(writer), metadata,
                        parseContext);

                String content = normalizeContent(writer);
                if (StringUtil.isBlank(content)) {
                    // No text was extracted: retry without the resource name,
                    // then without the content type, and finally (if enabled)
                    // read the raw bytes as plain text.
                    if (resourceName != null) {
                        IOUtils.closeQuietly(in);
                        if (logger.isDebugEnabled()) {
                            logger.debug("retry without a resource name: {}",
                                    resourceName);
                        }
                        in = new FileInputStream(tempFile);
                        final Metadata metadata2 = createMetadata(null,
                                contentType, contentEncoding, pdfPassword);
                        final StringWriter writer2 = new StringWriter(
                                initialBufferSize);
                        parser.parse(in, new BodyContentHandler(writer2),
                                metadata2, parseContext);
                        content = normalizeContent(writer2);
                    }
                    if (StringUtil.isBlank(content) && contentType != null) {
                        IOUtils.closeQuietly(in);
                        if (logger.isDebugEnabled()) {
                            logger.debug("retry without a content type: {}",
                                    contentType);
                        }
                        in = new FileInputStream(tempFile);
                        final Metadata metadata3 = createMetadata(null, null,
                                contentEncoding, pdfPassword);
                        final StringWriter writer3 = new StringWriter(
                                initialBufferSize);
                        parser.parse(in, new BodyContentHandler(writer3),
                                metadata3, parseContext);
                        content = normalizeContent(writer3);
                    }

                    if (readAsTextIfFailed && StringUtil.isBlank(content)) {
                        IOUtils.closeQuietly(in);
                        if (logger.isDebugEnabled()) {
                            logger.debug("read the content as a text.");
                        }
                        if (contentEncoding == null) {
                            contentEncoding = Constants.UTF_8;
                        }
                        BufferedReader br = null;
                        try {
                            br = new BufferedReader(new InputStreamReader(
                                    new FileInputStream(tempFile),
                                    contentEncoding));
                            final StringWriter writer4 = new StringWriter(
                                    initialBufferSize);
                            String line;
                            while ((line = br.readLine()) != null) {
                                writer4.write(line
                                        .replaceAll("\\p{Cntrl}", " ")
                                        .replaceAll("\\s+", " ").trim());
                                writer4.write(' ');
                            }
                            content = writer4.toString().trim();
                        } catch (final Exception e) {
                            logger.warn(
                                    "Could not read "
                                            + tempFile.getAbsolutePath(), e);
                        } finally {
                            IOUtils.closeQuietly(br);
                        }
                    }
                }

                final ExtractData extractData = new ExtractData(content);
                final String[] names = metadata.names();
                Arrays.sort(names);
                for (final String name : names) {
                    extractData.putValues(name, metadata.getValues(name));
                }

                if (logger.isDebugEnabled()) {
                    logger.debug("Result: metadata: {}", metadata);
                }

                return extractData;
            } catch (final TikaException e) {
                if (e.getMessage().indexOf("bomb") >= 0) {
                    // Propagate zip-bomb detections as-is.
                    throw e;
                }
                final Throwable cause = e.getCause();
                if (cause instanceof SAXException) {
                    // Tika failed on malformed XML/HTML: delegate to the
                    // dedicated XML extractor if one is registered.
                    final Extractor xmlExtractor = robotContainer
                            .getComponent("xmlExtractor");
                    if (xmlExtractor != null) {
                        IOUtils.closeQuietly(in);
                        in = new FileInputStream(tempFile);
                        return xmlExtractor.getText(in, params);
                    }
                }
                throw e;
            } finally {
                IOUtils.closeQuietly(in);

                // Restore the original streams and log whatever Tika wrote
                // to System.out/err during parsing.
                if (originalOutStream != null) {
                    System.setOut(originalOutStream);
                }
                if (originalErrStream != null) {
                    System.setErr(originalErrStream);
                }
                try {
                    if (logger.isInfoEnabled()) {
                        final byte[] bs = outStream.toByteArray();
                        if (bs.length != 0) {
                            logger.info(new String(bs, outputEncoding));
                        }
                    }
                    if (logger.isWarnEnabled()) {
                        final byte[] bs = errStream.toByteArray();
                        if (bs.length != 0) {
                            logger.warn(new String(bs, outputEncoding));
                        }
                    }
                } catch (final Exception e) {
                    // NOP
                }
            }
        } catch (final Exception e) {
            throw new ExtractException("Could not extract a content.", e);
        } finally {
            if (tempFile != null && !tempFile.delete()) {
                logger.warn("Failed to delete " + tempFile.getAbsolutePath());
            }
        }
    }

    private String normalizeContent(final StringWriter writer) {
        return writer.toString().replaceAll("\\s+", " ").trim();
    }

    String getPdfPassword(final String url, final String resourceName) {
        if (pdfPasswordMap.isEmpty()) {
            return null;
        }

        String value = null;
        if (StringUtil.isNotEmpty(url)) {
            value = url;
        } else if (StringUtil.isNotEmpty(resourceName)) {
            value = resourceName;
        }

        if (value != null) {
            for (final Map.Entry<String, String> entry : pdfPasswordMap
                    .entrySet()) {
                if (value.matches(entry.getKey())) {
                    return entry.getValue();
                }
            }
        }
        return null;
    }

    private Metadata createMetadata(final String resourceName,
            final String contentType, final String contentEncoding,
            final String pdfPassword) {
        final Metadata metadata = new Metadata();
        if (StringUtil.isNotEmpty(resourceName)) {
            metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        if (StringUtil.isNotBlank(contentType)) {
            metadata.set(HttpHeaders.CONTENT_TYPE, contentType);
        }
        if (StringUtil.isNotBlank(contentEncoding)) {
            metadata.set(HttpHeaders.CONTENT_ENCODING, contentEncoding);
        }
        if (pdfPassword != null) {
            metadata.add(ExtractData.PDF_PASSWORD, pdfPassword);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("metadata: {}", metadata);
        }

        return metadata;
    }

    public void addPdfPassword(final String regex, final String password) {
        pdfPasswordMap.put(regex, password);
    }

    // workaround: Tika does not have extension points.
    protected class DetectParser extends CompositeParser {
        /**
         *
         */
        private static final long serialVersionUID = 1L;

        /**
         * The type detector used by this parser to auto-detect the type of a
         * document.
         */
        private final Detector detector; // always set in the constructor

        /**
         * Creates an auto-detecting parser instance using the default Tika
         * configuration.
         */
        public DetectParser() {
            this(tikaConfig);
        }

        public DetectParser(final TikaConfig config) {
            super(config.getMediaTypeRegistry(), config.getParser());
            detector = config.getDetector();
        }

        @Override
        public void parse(final InputStream stream,
                final ContentHandler handler, final Metadata metadata,
                final ParseContext context) throws IOException, SAXException,
                TikaException {
            final TemporaryResources tmp = new TemporaryResources();
            try {
                final TikaInputStream tis = TikaInputStream.get(stream, tmp);

                // Automatically detect the MIME type of the document
                final MediaType type = detector.detect(tis, metadata);
                metadata.set(HttpHeaders.CONTENT_TYPE, type.toString());

                // TIKA-216: Zip bomb prevention
                final SecureContentHandler sch = new SecureContentHandler(
                        handler, tis);
                sch.setMaximumCompressionRatio(maxCompressionRatio);
                sch.setOutputThreshold(maxUncompressionSize);

                if (logger.isDebugEnabled()) {
                    logger.debug(
                            "type: {}, metadata: {}, maxCompressionRatio: {}, maxUncompressionSize: {}",
                            type, metadata, maxCompressionRatio,
                            maxUncompressionSize);
                }
                try {
                    // Parse the document
                    super.parse(tis, sch, metadata, context);
                } catch (final SAXException e) {
                    // Convert zip bomb exceptions to TikaExceptions
                    sch.throwIfCauseOf(e);
                    throw e;
                }
            } finally {
                tmp.dispose();
            }
        }

        @Override
        public void parse(final InputStream stream,
                final ContentHandler handler, final Metadata metadata)
                throws IOException, SAXException, TikaException {
            final ParseContext context = new ParseContext();
            context.set(Parser.class, this);
            parse(stream, handler, metadata, context);
        }
    }
}
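
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original class). A minimal, hypothetical
// example of how TikaExtractor might be instantiated and invoked directly,
// outside the DI container that normally injects robotContainer (so the
// xmlExtractor fallback is unavailable here). The file name "sample.pdf" and
// the password regex are placeholders for illustration only.
// ---------------------------------------------------------------------------
class TikaExtractorUsageExample {
    public static void main(final String[] args) throws Exception {
        final TikaExtractor extractor = new TikaExtractor();
        extractor.init(); // falls back to TikaConfig.getDefaultConfig()
        extractor.addPdfPassword(".*sample\\.pdf", "secret"); // hypothetical regex -> PDF password

        final Map<String, String> params = new HashMap<String, String>();
        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, "sample.pdf");
        params.put(HttpHeaders.CONTENT_TYPE, "application/pdf");

        try (InputStream in = new FileInputStream("sample.pdf")) {
            // Extracted text and Tika metadata are carried by ExtractData.
            final ExtractData data = extractor.getText(in, params);
            System.out.println(data);
        }
    }
}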