org.codelibs.fess.crawler.extractor.impl.TikaExtractor.java Source code

Introduction

Here is the source code for org.codelibs.fess.crawler.extractor.impl.TikaExtractor.java
Source

/*
 * Copyright 2012-2016 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.crawler.extractor.impl;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.Reader;
import java.io.Writer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.io.output.DeferredFileOutputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.container.CrawlerContainer;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.util.TextUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

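/**
 * {@link Extractor} implementation that uses Apache Tika to extract text and
 * metadata from a document stream.
 *
 * @author shinsuke
 */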
public class TikaExtractor implements Extractor {
    private static final Logger logger = LoggerFactory.getLogger(TikaExtractor.class);

    // Container used by getText() to look up the "xmlExtractor" component as a fallback.
    @Resource
    protected CrawlerContainer crawlerContainer;

    // Charset used to decode the captured System.out/System.err bytes before they are logged.
    public String outputEncoding = Constants.UTF_8;

    // When true, getText() falls back to reading the raw input as text if parsing yields blank content.
    public boolean readAsTextIfFailed = true;

    // Passed to SecureContentHandler.setMaximumCompressionRatio() by DetectParser.
    public long maxCompressionRatio = 100;

    // Passed to SecureContentHandler.setOutputThreshold() by DetectParser.
    public long maxUncompressionSize = 1000000;

    // Forwarded to TextUtil.normalizeText() by getContent().
    public int initialBufferSize = 10000;

    // Forwarded to TextUtil.normalizeText() by getContent().
    public boolean replaceDuplication = false;

    // Threshold handed to the DeferredFileOutputStream that buffers extracted content in getContent().
    public int memorySize = 1024 * 1024; //1mb

    // Forwarded to TextUtil.normalizeText() by getContent().
    public int maxAlphanumTermSize = -1;

    // Forwarded to TextUtil.normalizeText() by getContent().
    public int maxSymbolTermSize = -1;

    // Tika configuration; init() replaces a null value with TikaConfig.getDefaultConfig().
    public TikaConfig tikaConfig;

    // Maps a regular expression (matched against a URL or resource name) to a PDF password.
    protected Map<String, String> pdfPasswordMap = new HashMap<String, String>();

    /**
     * Initializes this extractor; declared as a {@code @PostConstruct} callback.
     * Falls back to Tika's default configuration when none has been set and, at
     * debug level, logs the media types supported by the configured parser.
     */
    @PostConstruct
    public void init() {
        if (tikaConfig == null) {
            tikaConfig = TikaConfig.getDefaultConfig();
        }

        if (!logger.isDebugEnabled()) {
            return;
        }
        logger.debug("supportedTypes: {}", tikaConfig.getParser().getSupportedTypes(new ParseContext()));
    }

    /**
     * Extracts text and metadata from a document stream with Tika.
     *
     * <p>The stream is first parsed using the resource name, content type, content
     * encoding and PDF password found in {@code params}. If the result is blank,
     * parsing is retried without the resource name, then without the content type,
     * and finally (when {@link #readAsTextIfFailed} is set) the raw input is decoded
     * as text using the content encoding, or UTF-8 when none was given.</p>
     *
     * <p>A {@link ByteArrayInputStream} is re-read in place through mark/reset. Any
     * other stream is copied to a temporary file that is deleted before this method
     * returns.</p>
     *
     * <p>If parsing fails with a {@link TikaException} caused by a
     * {@link SAXException}, and the message does not mention "bomb", the input is
     * handed to the "xmlExtractor" component of the crawler container when one is
     * available.</p>
     *
     * <p>NOTE(review): {@code System.out} and {@code System.err} are replaced for the
     * duration of the call so that output printed during parsing can be logged.
     * These streams are JVM-wide, so concurrent callers can interfere with each
     * other's capture.</p>
     *
     * @param inputStream the document content; must not be {@code null}
     * @param params optional hints (resource name, content type, content encoding,
     *        PDF password, URL); may be {@code null}
     * @return the extracted content with the metadata gathered by the first parse
     * @throws CrawlerSystemException if {@code inputStream} is {@code null}
     * @throws ExtractException if the temporary file cannot be created or the
     *         extraction fails
     */
    @Override
    public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
        if (inputStream == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }

        final File tempFile;
        final boolean isByteStream = inputStream instanceof ByteArrayInputStream;
        if (isByteStream) {
            // Remember the current position so that retries can rewind with reset().
            inputStream.mark(0);
            tempFile = null;
        } else {
            try {
                tempFile = File.createTempFile("tikaExtractor-", ".out");
            } catch (final IOException e) {
                throw new ExtractException("Could not create a temp file.", e);
            }
        }

        try {
            // Capture anything printed to stdout/stderr during parsing so it can be logged.
            final PrintStream originalOutStream = System.out;
            final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            System.setOut(new PrintStream(outStream, true));
            final PrintStream originalErrStream = System.err;
            final ByteArrayOutputStream errStream = new ByteArrayOutputStream();
            System.setErr(new PrintStream(errStream, true));
            try {
                final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
                final String contentType = params == null ? null : params.get(HttpHeaders.CONTENT_TYPE);
                final String contentEncoding = params == null ? null : params.get(HttpHeaders.CONTENT_ENCODING);

                // password for pdf
                String pdfPassword = params == null ? null : params.get(ExtractData.PDF_PASSWORD);
                if (pdfPassword == null && params != null) {
                    pdfPassword = getPdfPassword(params.get(ExtractData.URL), resourceName);
                }

                final Metadata metadata = createMetadata(resourceName, contentType, contentEncoding, pdfPassword);

                final Parser parser = new DetectParser();
                final ParseContext parseContext = new ParseContext();
                parseContext.set(Parser.class, parser);

                // First attempt: spool a non-byte stream to the temp file, then parse with all hints.
                String content = getContent(writer -> {
                    InputStream in = null;
                    try {
                        if (isByteStream) {
                            in = inputStream;
                        } else {
                            try (OutputStream out = new FileOutputStream(tempFile)) {
                                CopyUtil.copy(inputStream, out);
                            }
                            in = new FileInputStream(tempFile);
                        }
                        parser.parse(in, new BodyContentHandler(writer), metadata, parseContext);
                    } finally {
                        IOUtils.closeQuietly(in);
                    }
                }, contentEncoding);

                if (StringUtil.isBlank(content)) {
                    if (resourceName != null) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("retry without a resource name: {}", resourceName);
                        }
                        final Metadata metadata2 = createMetadata(null, contentType, contentEncoding, pdfPassword);
                        content = retryParse(parser, parseContext, metadata2, inputStream, tempFile, contentEncoding);
                    }
                    if (StringUtil.isBlank(content) && contentType != null) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("retry without a content type: {}", contentType);
                        }
                        final Metadata metadata3 = createMetadata(null, null, contentEncoding, pdfPassword);
                        content = retryParse(parser, parseContext, metadata3, inputStream, tempFile, contentEncoding);
                    }

                    if (readAsTextIfFailed && StringUtil.isBlank(content)) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("read the content as a text.");
                        }
                        final String enc = contentEncoding == null ? Constants.UTF_8 : contentEncoding;
                        content = readAsPlainText(inputStream, tempFile, enc);
                    }
                }
                final ExtractData extractData = new ExtractData(content);

                // Only the metadata of the first parse attempt is reported.
                final String[] names = metadata.names();
                Arrays.sort(names);
                for (final String name : names) {
                    extractData.putValues(name, metadata.getValues(name));
                }

                if (logger.isDebugEnabled()) {
                    logger.debug("Result: metadata: {}", metadata);
                }

                return extractData;
            } catch (final TikaException e) {
                // The message may be null; guard before inspecting it.
                final String message = e.getMessage();
                if (message != null && message.contains("bomb")) {
                    throw e;
                }
                final Throwable cause = e.getCause();
                if (cause instanceof SAXException) {
                    final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor");
                    if (xmlExtractor != null) {
                        InputStream in = null;
                        try {
                            in = reopenSource(inputStream, tempFile);
                            return xmlExtractor.getText(in, params);
                        } finally {
                            IOUtils.closeQuietly(in);
                        }
                    }
                }
                throw e;
            } finally {
                if (originalOutStream != null) {
                    System.setOut(originalOutStream);
                }
                if (originalErrStream != null) {
                    System.setErr(originalErrStream);
                }
                try {
                    if (logger.isInfoEnabled()) {
                        final byte[] bs = outStream.toByteArray();
                        if (bs.length != 0) {
                            logger.info(new String(bs, outputEncoding));
                        }
                    }
                    if (logger.isWarnEnabled()) {
                        final byte[] bs = errStream.toByteArray();
                        if (bs.length != 0) {
                            logger.warn(new String(bs, outputEncoding));
                        }
                    }
                } catch (final Exception e) {
                    // Best effort only: failing to report captured output must not fail the extraction.
                    logger.debug("Could not log the captured output.", e);
                }
            }
        } catch (final Exception e) {
            throw new ExtractException("Could not extract a content.", e);
        } finally {
            if (tempFile != null && !tempFile.delete()) {
                logger.warn("Failed to delete " + tempFile.getAbsolutePath());
            }
        }
    }

    /**
     * Returns a stream positioned at the start of the source: the rewound byte
     * stream when no temp file is in use, otherwise a new stream over the temp file.
     */
    private InputStream reopenSource(final InputStream inputStream, final File tempFile) throws IOException {
        if (tempFile == null) {
            inputStream.reset();
            return inputStream;
        }
        return new FileInputStream(tempFile);
    }

    /** Re-parses the source from its beginning using the given metadata. */
    private String retryParse(final Parser parser, final ParseContext parseContext, final Metadata retryMetadata,
            final InputStream inputStream, final File tempFile, final String contentEncoding) throws TikaException {
        return getContent(writer -> {
            InputStream in = null;
            try {
                in = reopenSource(inputStream, tempFile);
                parser.parse(in, new BodyContentHandler(writer), retryMetadata, parseContext);
            } finally {
                IOUtils.closeQuietly(in);
            }
        }, contentEncoding);
    }

    /** Decodes the source as text in the given charset; read failures are logged, not thrown. */
    private String readAsPlainText(final InputStream inputStream, final File tempFile, final String enc)
            throws TikaException {
        return getContent(writer -> {
            // Track the raw stream so it is closed even if the reader cannot be created
            // (e.g. an unsupported charset name).
            InputStream in = null;
            try {
                in = reopenSource(inputStream, tempFile);
                final BufferedReader br = new BufferedReader(new InputStreamReader(in, enc));
                String line;
                while ((line = br.readLine()) != null) {
                    writer.write(line);
                    // readLine() strips the terminator; keep adjacent lines from fusing into one word.
                    writer.write(' ');
                }
            } catch (final Exception e) {
                logger.warn(
                        "Could not read "
                                + (tempFile != null ? tempFile.getAbsolutePath() : "a byte stream"),
                        e);
            } finally {
                IOUtils.closeQuietly(in);
            }
        }, enc);
    }

    /**
     * Opens a stream over the data written to the given deferred output stream.
     *
     * @param dfos the deferred output stream holding the written content
     * @return a stream over the in-memory bytes, or a buffered stream over the
     *         backing file when the content is no longer held in memory
     * @throws IOException if the backing file cannot be opened
     */
    protected InputStream getContentStream(DeferredFileOutputStream dfos) throws IOException {
        if (!dfos.isInMemory()) {
            return new BufferedInputStream(new FileInputStream(dfos.getFile()));
        }
        return new ByteArrayInputStream(dfos.getData());
    }

    /**
     * Runs the given content writer against a buffer and returns the normalized text.
     *
     * <p>The buffer is a {@link DeferredFileOutputStream} created with
     * {@link #memorySize} as its threshold and a freshly created temp file as its
     * overflow target. The temp file is removed before this method returns.</p>
     *
     * @param out callback that writes the raw content to the supplied writer
     * @param encoding charset for writing and re-reading the buffer; UTF-8 is used
     *        when {@code null}
     * @return the buffered content after {@code TextUtil.normalizeText}
     * @throws TikaException if the callback throws one; it is propagated unchanged
     * @throws CrawlerSystemException if the temp file cannot be created
     * @throws ExtractException if any other failure occurs
     */
    protected String getContent(final ContentWriter out, String encoding) throws TikaException {
        final File spillFile;
        try {
            spillFile = File.createTempFile("tika", ".tmp");
        } catch (IOException e) {
            throw new CrawlerSystemException("Failed to create a temp file.", e);
        }

        final String charset = encoding != null ? encoding : Constants.UTF_8;
        try (DeferredFileOutputStream buffer = new DeferredFileOutputStream(memorySize, spillFile)) {
            final BufferedWriter sink = new BufferedWriter(new OutputStreamWriter(buffer, charset));
            out.accept(sink);
            // Push everything through to the buffer before reading it back.
            sink.flush();

            try (Reader source = new InputStreamReader(getContentStream(buffer), charset)) {
                return TextUtil.normalizeText(source, initialBufferSize, maxAlphanumTermSize, maxSymbolTermSize,
                        replaceDuplication);
            }
        } catch (TikaException e) {
            throw e;
        } catch (Exception e) {
            throw new ExtractException("Failed to read a content.", e);
        } finally {
            final boolean removed = !spillFile.exists() || spillFile.delete();
            if (!removed) {
                logger.warn("Failed to delete " + spillFile.getAbsolutePath());
            }
        }
    }

    /**
     * Looks up a registered PDF password for a document.
     *
     * <p>The URL is used as the lookup value when it is not empty; otherwise the
     * resource name is used. The value must match a registered regular expression
     * in full.</p>
     *
     * @param url the document URL; may be {@code null} or empty
     * @param resourceName the document resource name; may be {@code null} or empty
     * @return the password of a matching entry, or {@code null} when nothing is
     *         registered, both arguments are empty, or no expression matches
     */
    String getPdfPassword(final String url, final String resourceName) {
        if (pdfPasswordMap.isEmpty()) {
            return null;
        }

        final String target;
        if (StringUtil.isNotEmpty(url)) {
            target = url;
        } else if (StringUtil.isNotEmpty(resourceName)) {
            target = resourceName;
        } else {
            return null;
        }

        for (final Map.Entry<String, String> candidate : pdfPasswordMap.entrySet()) {
            final String regex = candidate.getKey();
            if (target.matches(regex)) {
                return candidate.getValue();
            }
        }
        return null;
    }

    /**
     * Builds the Tika metadata passed to the parser.
     *
     * <p>The resource name is set when not empty, the content type and content
     * encoding when not blank, and the PDF password when not {@code null}. The
     * resulting metadata is logged at debug level.</p>
     *
     * @param resourceName the resource name hint, or {@code null}
     * @param contentType the content type hint, or {@code null}
     * @param contentEncoding the content encoding hint, or {@code null}
     * @param pdfPassword the PDF password, or {@code null}
     * @return a new metadata instance holding the supplied hints
     */
    private Metadata createMetadata(final String resourceName, final String contentType,
            final String contentEncoding, final String pdfPassword) {
        final Metadata hints = new Metadata();

        if (StringUtil.isNotEmpty(resourceName)) {
            hints.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }

        if (StringUtil.isNotBlank(contentType)) {
            hints.set(HttpHeaders.CONTENT_TYPE, contentType);
        }

        if (StringUtil.isNotBlank(contentEncoding)) {
            hints.set(HttpHeaders.CONTENT_ENCODING, contentEncoding);
        }

        if (pdfPassword != null) {
            hints.add(ExtractData.PDF_PASSWORD, pdfPassword);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("metadata: {}", hints);
        }
        return hints;
    }

    /**
     * Registers a PDF password for documents whose URL or resource name matches
     * the given regular expression. An existing entry for the same expression is
     * replaced.
     *
     * @param regex the regular expression used as the lookup key
     * @param password the password to associate with {@code regex}
     */
    public void addPdfPassword(final String regex, final String password) {
        pdfPasswordMap.put(regex, password);
    }

    // workaround: Tika does not have extension points.
    /**
     * Composite parser that detects the media type of a document before parsing
     * it, and wraps the content handler in a {@link SecureContentHandler}
     * configured from the enclosing extractor's limits.
     */
    protected class DetectParser extends CompositeParser {
        private static final long serialVersionUID = 1L;

        /**
         * Detector used to determine the media type of each parsed document;
         * assigned once in the constructor.
         */
        private final Detector detector;

        /**
         * Creates a parser backed by the enclosing extractor's Tika
         * configuration.
         */
        public DetectParser() {
            this(tikaConfig);
        }

        /**
         * Creates a parser using the media type registry, parser and detector of
         * the given configuration.
         *
         * @param config the Tika configuration to take the components from
         */
        public DetectParser(final TikaConfig config) {
            super(config.getMediaTypeRegistry(), config.getParser());
            detector = config.getDetector();
        }

        /**
         * Detects the document type, records it as the content type in
         * {@code metadata}, and parses the document through a
         * {@link SecureContentHandler}.
         */
        @Override
        public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                final ParseContext context) throws IOException, SAXException, TikaException {
            final TemporaryResources resources = new TemporaryResources();
            try {
                final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);

                // Detect the MIME type and publish it through the metadata.
                final MediaType detectedType = detector.detect(tikaStream, metadata);
                metadata.set(HttpHeaders.CONTENT_TYPE, detectedType.toString());

                // TIKA-216: guard against zip bombs.
                final SecureContentHandler secureHandler = new SecureContentHandler(handler, tikaStream);
                secureHandler.setMaximumCompressionRatio(maxCompressionRatio);
                secureHandler.setOutputThreshold(maxUncompressionSize);

                if (logger.isDebugEnabled()) {
                    logger.debug("type: {}, metadata: {}, maxCompressionRatio: {}, maxUncompressionSize: {}",
                            detectedType, metadata, maxCompressionRatio, maxUncompressionSize);
                }

                try {
                    super.parse(tikaStream, secureHandler, metadata, context);
                } catch (final SAXException e) {
                    // Surface zip bomb detections as TikaExceptions; rethrow anything else.
                    secureHandler.throwIfCauseOf(e);
                    throw e;
                }
            } finally {
                resources.dispose();
            }
        }

        /**
         * Parses with a new {@link ParseContext} in which this parser is
         * registered as the {@link Parser}.
         */
        @Override
        public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata)
                throws IOException, SAXException, TikaException {
            final ParseContext selfContext = new ParseContext();
            selfContext.set(Parser.class, this);
            parse(stream, handler, metadata, selfContext);
        }

    }

    /**
     * Callback that produces content by writing it to a supplied writer. Unlike
     * {@code java.util.function.Consumer}, the callback may throw checked
     * exceptions.
     */
    @FunctionalInterface
    protected interface ContentWriter {
        /**
         * Writes the content to the given writer.
         *
         * @param writer the destination for the content
         * @throws Exception if the content cannot be produced or written
         */
        void accept(Writer writer) throws Exception;
    }
}