nl.minbzk.dwr.zoeken.enricher.processor.TikaProcessor.java Source code


Introduction

Here is the source code for nl.minbzk.dwr.zoeken.enricher.processor.TikaProcessor.java
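The class wraps Apache Tika's AutoDetectParser and adds charset hinting (via ICU4J's CharsetDetector), n-gram language detection (Cybozu langdetect), GridFS content retrieval and UIMA entity extraction. For orientation, this is the bare Tika pattern the processor builds on - a minimal sketch, not taken from the enricher sources, with a made-up class name:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class BareTikaExample {
    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();

        // Type detection before parsing needs mark/reset support, which is why the
        // processor below wraps unmarked streams in a BufferedInputStream
        InputStream stream = new BufferedInputStream(new FileInputStream(args[0]));

        try {
            // A negative write limit disables the handler's default 100,000 character cap
            BodyContentHandler handler = new BodyContentHandler(-1);

            parser.parse(stream, handler, metadata, new ParseContext());

            System.out.println(metadata.get("Content-Type"));
            System.out.println(handler.toString());
        } finally {
            stream.close();
        }
    }
}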

Source

/* Copyright (c) 2010 Ministry of the Interior and Kingdom Relations,
 * the Netherlands. All rights reserved.
 * 
 * This file is part of the MinBZK Search Enricher indexing generator.
 * 
 * Search Enricher is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Search Enricher is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Search Enricher. If not, see <http://www.gnu.org/licenses/>. */

package nl.minbzk.dwr.zoeken.enricher.processor;

import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import com.mongodb.gridfs.GridFS;
import com.mongodb.gridfs.GridFSDBFile;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import nl.minbzk.dwr.zoeken.enricher.Processor;
import nl.minbzk.dwr.zoeken.enricher.ProcessorResult;
import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
import nl.minbzk.dwr.zoeken.enricher.aci.ImportEnvelope;
import nl.minbzk.dwr.zoeken.enricher.processor.uima.UIMAUnit;
import nl.minbzk.dwr.zoeken.enricher.settings.EnricherJob;
import nl.minbzk.dwr.zoeken.enricher.settings.EnricherSettings;
import nl.minbzk.dwr.zoeken.enricher.util.XmlHtmlReader;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.bson.types.ObjectId;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.MongoDbFactory;

import javax.xml.transform.TransformerConfigurationException;

import static java.lang.String.format;

/**
 * Apache Tika based content processor.
 * 
 * @author Jasper van Veghel <j.veghel@rijksoverheid.nl>
 */
public class TikaProcessor implements Processor {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(TikaProcessor.class);

    /**
     * Constants: the default fall-back encoding and, per ISO 639-1 language code,
     * the character sets considered plausible for that language (used to bias
     * charset detection in introduceEncodingHints).
     */
    private static final String DEFAULT_ENCODING = "UTF-8";
    private static final Map<String, List<String>> PLAUSIBLE_ENCODING_MAPPING = new HashMap<String, List<String>>();

    static {
        // Japanese

        PLAUSIBLE_ENCODING_MAPPING.put("ja", Arrays.asList("UTF-8", "UTF-16", "Shift_JIS", "ISO-2022-JP", "EUC-JP"));

        // Chinese

        PLAUSIBLE_ENCODING_MAPPING.put("zh", Arrays.asList("UTF-8", "UTF-16", "ISO-2022-CN", "GB18030", "Big5"));

        // Korean

        PLAUSIBLE_ENCODING_MAPPING.put("ko", Arrays.asList("UTF-8", "UTF-16", "ISO-2022-KR", "EUC-KR"));

        // Russian and Bulgarian

        PLAUSIBLE_ENCODING_MAPPING.put("ru", Arrays.asList("UTF-8", "UTF-16", "ISO-8859-5", "KOI8-R", "windows-1251"));
        PLAUSIBLE_ENCODING_MAPPING.put("bg", Arrays.asList("UTF-8", "UTF-16", "windows-1251", "KOI8-R"));

        // Arabic

        PLAUSIBLE_ENCODING_MAPPING.put("ar", Arrays.asList("UTF-8", "UTF-16", "ISO-8859-6", "windows-1256"));

        // Greek

        PLAUSIBLE_ENCODING_MAPPING.put("el", Arrays.asList("UTF-8", "UTF-16", "ISO-8859-7"));

        // Hebrew

        PLAUSIBLE_ENCODING_MAPPING.put("he", Arrays.asList("UTF-8", "UTF-16", "ISO-8859-8"));

        // Turkish

        PLAUSIBLE_ENCODING_MAPPING.put("tr", Arrays.asList("UTF-8", "UTF-16", "ISO-8859-9"));
    }

    private static final Character CHARACTER_NBSP = 0xA0; // U+00A0, the non-breaking space

    private static final List<String> METADATA_IGNORE = Arrays.asList("Content-Type", "Content-Encoding",
            "Content-Language", "title");

    /**
     * Language name to ISO 639-1 code mapping.
     * 
     * NOTE: This is not complete. See ISO 639-1 for the full code list.
     */
    private static enum LanguageMappingType {
        english("en"), dutch("nl"), french("fr"), german("de"), japanese("ja"), swedish("sv"), norwegian(
                "no"), danish("da"), spanish("es"), chinese("zh"), korean("ko"), persian("fa"), greek(
                        "el"), hebrew("he"), hindi("hi"), armenian("hy"), icelandic("is"), polish("pl"), slovenian(
                                "sl"), slovak("sk"), thai("th"), vietnamese(
                                        "vi"), italian("it"), arabic("ar"), turkish("tr"), tamil("ta");

        /**
         * IDX index identifier.
         */
        public String language;

        /**
         * Default constructor
         * 
         * @param language
         */
        LanguageMappingType(final String language) {
            this.language = language;
        }
    }

    /**
     * Tika parse context.
     */
    private final ParseContext parseContext;

    /**
     * Tika parser.
     */
    private final AutoDetectParser parser;

    /**
     * MongoDB factory.
     */
    @Autowired
    private MongoDbFactory dbFactory;

    /**
     * GridFS reference (to be lazily initialized).
     */
    private GridFS gridFs = null;

    /**
     * Pre-processor mapping.
     */
    private Map<String, List<PreProcessor>> preprocessors;

    /**
     * UIMA units.
     */
    private final Map<String, UIMAUnit> uimaUnits = new ConcurrentHashMap<String, UIMAUnit>();

    /**
     * Default constructor. Use an auto detect parser.
     * 
     * @param settings
     * @throws TransformerConfigurationException
     * @throws ClassNotFoundException
     * @throws IllegalAccessException
     * @throws InstantiationException
     * @throws LangDetectException
     */
    @Autowired
    public TikaProcessor(final EnricherSettings settings) throws TransformerConfigurationException,
            InstantiationException, IllegalAccessException, ClassNotFoundException, LangDetectException {
        parseContext = new ParseContext();
        parser = new AutoDetectParser();

        if (settings.getLanguageDetectionProfiles() != null) {
            if (DetectorFactory.getLangList().size() == 0)
                DetectorFactory.loadProfile(settings.getLanguageDetectionProfiles());
        } else
            logger.warn(
                    "No language detection profiles were provided - language detection will be disabled throughout");

        // Add in any custom detectors

        if (settings.getTikaDetectors().size() > 0) {
            ArrayList<Detector> detectors = new ArrayList<Detector>();

            detectors.add(parser.getDetector());

            for (String detectorClass : settings.getTikaDetectors()) {
                if (logger.isInfoEnabled())
                    logger.info(format("Adding custom detector with class '%s'", detectorClass));

                detectors.add(
                        (Detector) Class.forName(detectorClass, true, getClass().getClassLoader()).newInstance());
            }

            // Combine the various detectors into a composite detector

            parser.setDetector(new CompositeDetector(detectors));
        }

        // Add in any custom parsers

        if (settings.getTikaParsers().size() > 0) {
            Map<MediaType, Parser> parsers = parser.getParsers();

            for (String parserClass : settings.getTikaParsers()) {
                if (logger.isInfoEnabled())
                    logger.info(format("Adding custom parser with class '%s'", parserClass));

                Parser customParser = (Parser) Class.forName(parserClass, true, getClass().getClassLoader())
                        .newInstance();

                // Add the parser for each supported type

                for (MediaType mediaType : customParser.getSupportedTypes(parseContext))
                    parsers.put(mediaType, customParser);
            }

            parser.setParsers(parsers);
        }

        // Initialize all job-associated UIMA descriptors

        for (Entry<String, EnricherJob> jobEntry : settings.getJobs().entrySet())
            if (jobEntry.getValue().getEntityDetectionDescriptors() != null) {
                List<String> languages = jobEntry.getValue().getLanguageDetectionSupported();

                if (jobEntry.getValue().getEntityDetectionLanguages() != null
                        && jobEntry.getValue().getEntityDetectionLanguages().size() > 0)
                    languages = jobEntry.getValue().getEntityDetectionLanguages();
                if (languages == null || languages.size() == 0)
                    languages = Arrays.asList(jobEntry.getValue().getLanguageDetectionDefault() != null
                            ? jobEntry.getValue().getLanguageDetectionDefault()
                            : "en");

                for (String jobLanguage : languages)
                    try {
                        uimaUnits.put(jobEntry.getKey() + "-" + jobLanguage,
                                new UIMAUnit(jobEntry.getValue().getEntityDetectionDescriptors(), jobLanguage,
                                        settings.getLanguageAnalysisMaximumInstances(),
                                        settings.getLanguageAnalysisWaitingTimeout()));

                        logger.info("Adding UIMA unit '" + jobEntry.getKey() + "-" + jobLanguage + "' with "
                                + settings.getLanguageAnalysisMaximumInstances() + " instance(s) to unit list");
                    } catch (Exception e) {
                        logger.error("Could not initialize the UIMA unit for job entry '" + jobEntry.getKey()
                                + "' and language '" + jobLanguage + "'", e);
                    }
            }

        // Now set the context parser, so that embedded documents are parsed recursively as well

        parseContext.set(Parser.class, parser);
    }

    /**
     * Do the actual content processing.
     * 
     * @param envelope
     * @param settings
     * @param job
     * @param context
     * @return ProcessorResult
     * @throws Exception
     */
    @Override
    public ProcessorResult process(final ImportEnvelope envelope, final EnricherSettings settings,
            final EnricherJob job, final ProcessorContext context) throws Exception {
        String inputUri = envelope.getUri();
        String envelopeEncoding = envelope.getFields().get(settings.getEncodingMatch()) != null
                ? envelope.getFields().get(settings.getEncodingMatch()).get(0)
                : null;

        // Log whether the envelope carries actual content or only metadata

        if (inputUri == null) {
            if (logger.isInfoEnabled())
                logger.info("Given import envelope contains no content URI and thus no content - only metadata");
        } else {
            if (logger.isDebugEnabled())
                logger.debug("Given import envelope content URI is " + inputUri);
        }

        // Handle the input content

        StringWriter output = new StringWriter();
        Metadata metadata = new Metadata();
        MediaType mediaType = null;

        // Add the reference for optimal detection

        if (job.getTikaResourceKeyPriority() != null && job.getTikaResourceKeyPriority().size() > 0)
            for (String resourceKey : job.getTikaResourceKeyPriority())
                if (envelope.getFields().containsKey(resourceKey)) {
                    String resourceValue = envelope.getFields().get(resourceKey).get(0);

                    if (logger.isInfoEnabled())
                        logger.info(format("Setting Tika resource to key '%s' with value '%s'", resourceKey,
                                resourceValue));

                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceValue);

                    break;
                }

        if (metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY) == null)
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, envelope.getReference());

        // Determine whether to read from standard input, a file or a URL

        if (inputUri != null) {
            File inputFile = new File(inputUri);

            InputStream inputStream = readInputUri(inputUri, inputFile);

            // Pre-process the input if necessary

            try {
                mediaType = parser.getDetector().detect(inputStream, metadata);

                if (logger.isDebugEnabled())
                    logger.debug("Parser detected " + mediaType.getType() + " / " + mediaType.getSubtype());

                // Detect the input encoding and introduce any optional encoding hints

                Charset inputEncoding = introduceEncodingHints(inputStream, metadata, envelopeEncoding,
                        envelope.getReferenceEncoding(),
                        job.getLanguageDetectionParameter() != null
                                ? envelope.getSingleValue(job.getLanguageDetectionParameter())
                                : null,
                        mediaType, job);

                // Pre-process the input stream, if needed

                for (Map.Entry<String, List<PreProcessor>> preprocessorsEntry : preprocessors.entrySet())
                    if (mediaType.getSubtype().contains(preprocessorsEntry.getKey()))
                        for (PreProcessor preprocessor : preprocessorsEntry.getValue()) {
                            inputStream = preprocessor.process(inputStream, inputEncoding.toString(), envelope,
                                    settings, job);

                            // Following the pre-processor contract, the stream should now be encoded as UTF-8

                            inputEncoding = Charset.forName("UTF-8");
                            metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString() + "; charset=UTF-8");
                        }

                parser.parse(inputStream, new BodyContentHandler(output), metadata, parseContext);
            } finally {
                inputStream.close();

                // Delete the file if requested to

                if (inputFile.isFile() && envelope.isDeleteOriginal())
                    inputFile.delete();
            }

            output.flush();
        }

        String outputContent = output.toString().trim();

        // Handle the result

        Map<String, List<String>> metadataResult = new LinkedHashMap<String, List<String>>();
        List<ProcessorContent> contentResult = new ArrayList<ProcessorContent>();

        for (String name : metadata.names())
            if (!METADATA_IGNORE.contains(name)) {
                List<String> values = new ArrayList<String>();

                values.add(metadata.get(name));

                metadataResult.put(name, values);
            }

        contentResult.addAll(applyWordBreaking(envelope.getReference(), outputContent, settings.getWordBreakMin(),
                settings.getWordBreakMax()));

        // Only take the first language, should multiple ones be present

        String detectedLanguage = null;

        if (job.getLanguageDetectionParameter() != null) {
            if (!StringUtils.isEmpty(metadata.get(HttpHeaders.CONTENT_LANGUAGE)))
                detectedLanguage = metadata.get(HttpHeaders.CONTENT_LANGUAGE).split(",")[0].trim().split("-")[0]
                        .trim();
            else {
                // Use an n-gram language profiler to detect the effective language, if it was not given or could be detected otherwise

                String categorizerLanguage = categorizeCybozu(settings, job, outputContent);

                if (!categorizerLanguage.equals("unknown"))
                    metadata.set(HttpHeaders.CONTENT_LANGUAGE, (detectedLanguage = categorizerLanguage));
                else
                    logger.warn(
                            "The language could not be detected using n-gram detection; leaving language detection empty");
            }
        }

        // Perform entity extraction using UIMA, if applicable

        if (job.getEntityDetectionDescriptors() != null) {
            // We can only do this if we know the language

            if (detectedLanguage != null) {
                // And if the language is supported

                if (job.getEntityDetectionLanguages() != null && job.getEntityDetectionLanguages().size() > 0
                        && !job.getEntityDetectionLanguages().contains(detectedLanguage))
                    logger.warn(
                            "Not performing UIMA processing as the document language is not covered by the UIMA detectors");
                else
                    try {
                        performInjection(job, context, contentResult, detectedLanguage, mediaType);
                    } catch (AnalysisEngineProcessException e) {
                        logger.error("Could not inject UIMA metadata due to an analysis engine process exception",
                                e);
                    }
            } else
                logger.warn("Not performing UIMA processing as no language was provided or detected");
        }

        String actualMediaType = "text/plain";

        if (mediaType != null)
            actualMediaType = mediaType.getType() + "/" + mediaType.getSubtype();

        return new ProcessorResult(metadataResult, contentResult, actualMediaType, detectedLanguage);
    }

    /**
     * Transform the given input URI into an InputStream.
     *
     * @param inputUri
     * @param file
     * @return InputStream
     * @throws Exception
     */
    private InputStream readInputUri(final String inputUri, final File file) throws Exception {
        InputStream inputStream;

        if (file.isFile()) {
            inputStream = new FileInputStream(file);

            if (logger.isInfoEnabled())
                logger.info("Fetching given input file " + inputUri);
        } else {
            try {
                URI uri = new URI(inputUri);

                if (inputUri.startsWith("data://")) {
                    if (logger.isInfoEnabled())
                        logger.info("Fetching data URI with ID " + inputUri.substring(7));

                    inputStream = retrieveGridStream(inputUri.substring(7));
                } else {
                    if (logger.isInfoEnabled())
                        logger.info("Fetching given input URL " + inputUri);

                    inputStream = uri.toURL().openStream();
                }
            } catch (URISyntaxException e) {
                throw new Exception("The given input URI is neither an existing file nor a valid URL.", e);
            }
        }

        // Type and charset detection require mark/reset support on the stream

        if (!inputStream.markSupported())
            inputStream = new BufferedInputStream(inputStream);

        return inputStream;
    }

    /**
     * Perform cybozu-based language detection.
     * 
     * @param settings
     * @param job
     * @param outputContent
     * @return String
     * @throws LangDetectException
     */
    public static String categorizeCybozu(final EnricherSettings settings, final EnricherJob job,
            final String outputContent) throws LangDetectException {
        com.cybozu.labs.langdetect.Detector detector = DetectorFactory.create();

        if (outputContent.length() >= 10) {
            if (settings.getLanguageAnalysisThreshold() != 0
                    && outputContent.length() > settings.getLanguageAnalysisThreshold())
                detector.append(outputContent.substring(0, settings.getLanguageAnalysisThreshold()));
            else
                detector.append(outputContent);

            if (job.getLanguageDetectionSupported() == null) {
                String language = detector.detect();

                if (language != null)
                    return language.substring(0, 2);
            } else {
                detector.detect();

                for (Language probableLanguage : detector.getProbabilities()) {
                    String language = probableLanguage.lang.substring(0, 2);

                    if (job.getLanguageDetectionSupported().contains(language))
                        return language;
                }
            }
        }

        return "unknown";
    }

    /**
     * Perform UIMA injection.
     * 
     * @param job
     * @param context
     * @param processorOutputs
     * @param detectedLanguage
     * @param mediaType
     * @throws AnalysisEngineProcessException
     */
    private void performInjection(final EnricherJob job, final ProcessorContext context,
            final List<ProcessorContent> processorOutputs, final String detectedLanguage, final MediaType mediaType)
            throws AnalysisEngineProcessException {
        UIMAUnit uimaUnit = uimaUnits.get(job.getName() + "-" + detectedLanguage);

        if (uimaUnit == null) {
            if (logger.isDebugEnabled())
                logger.debug(format("No UIMA unit found for job %s and detected language %s", job.getName(),
                        detectedLanguage));

            return;
        }

        UIMAInjector injector = new UIMAInjector(context, uimaUnit, job.getEntityDetectionScanTypes(),
                job.getEntityDetectionFieldPrefix(), job.getGeoSpatialFieldPrefix());

        for (ProcessorContent processorOutput : processorOutputs) {
            String documentOutput = processorOutput.getContent(), documentId = processorOutput.getContentId();

            if ((documentOutput = UIMAInjector.adjustForEligibility(documentOutput)) == null)
                continue;

            injector.inject(documentOutput, documentId, detectedLanguage, mediaType, processorOutput);
        }
    }

    /**
     * Introduce encoding hints into the metadata if possible, and return the detected (or default) stream encoding through the following process:
     * 
     * In case of text/html or application/xhtml+xml:
     * 
     * 1) Take the Content-Type as the mediaType + referenceEncoding, if it was given. Otherwise use the envelopeEncoding, if it was given.
     * 2) Process it through the ROME-derived XmlReader, using DEFAULT_ENCODING as the fall-back default.
     * 
     * In case of anything else:
     * 
     * 1) Use the reference encoding if it was given. Otherwise use the envelopeEncoding, if it was given.
     * 2) If no encoding hints were given, attempt to detect it using ICU4J's CharsetDetector and match it against a set of plausible encodings.
     * 3) If no plausible encoding was found, return the DEFAULT_ENCODING as the fall-back default.
     * 
     * Additionally, if a language was given as an envelope field (specified in FetchSettings) we also add it to the metadata.
     * 
     * @param bufferedStream
     * @param metadata
     * @param envelopeEncoding
     * @param referenceEncoding
     * @param envelopeLanguage
     * @param mediaType
     * @param job
     * @return Charset
     * @throws IOException
     */
    public static Charset introduceEncodingHints(final InputStream bufferedStream, final Metadata metadata,
            final String envelopeEncoding, final String referenceEncoding, final String envelopeLanguage,
            final MediaType mediaType, final EnricherJob job) throws IOException {
        Charset inputEncoding = Charset.forName(DEFAULT_ENCODING);

        boolean isLanguageDetectionEnabled = job.getLanguageDetectionParameter() != null;

        // The encoding is either the reference encoding, or the language / envelope encoding if null

        String[] optionalLanguageEncoding = envelopeLanguage != null ? retrieveLanguageEncoding(envelopeLanguage)
                : envelopeEncoding != null ? retrieveLanguageEncoding(envelopeEncoding) : new String[] {};
        String preferenceEncoding = referenceEncoding != null ? referenceEncoding
                : optionalLanguageEncoding.length == 2 ? optionalLanguageEncoding[1] : null;

        // If language detection has been disabled, don't do anything;
        // If a language was derived from the language-encoding parameter (typically DRELANGUAGE) use it;
        // If a default language has been requested, prefer it;

        String language = null;

        if (isLanguageDetectionEnabled) {
            if (job.getLanguageDetectionDefault() != null)
                language = job.getLanguageDetectionDefault();
            else if (optionalLanguageEncoding.length == 2) {
                if (job.getLanguageDetectionSupported() != null
                        && !job.getLanguageDetectionSupported().contains(optionalLanguageEncoding[0]))
                    logger.warn("The envelope or encoding-hint derived language (" + optionalLanguageEncoding[0]
                            + ") does not fall within the list of supported languages for this job - resorting to META and n-gram detection");
                else
                    language = optionalLanguageEncoding[0];
            }
        }

        if (mediaType != null
                && (mediaType.getSubtype().startsWith("html") || mediaType.getSubtype().startsWith("xhtml"))) {
            XmlHtmlReader reader = new XmlHtmlReader(bufferedStream,
                    mediaType + (preferenceEncoding != null ? "; charset=" + preferenceEncoding : ""), true,
                    DEFAULT_ENCODING);

            // Always reset the stream, as XmlHtmlReader has already consumed some bytes

            try {
                bufferedStream.reset();
            } catch (IOException e) {
                // Do nothing
            }

            // In exchange, the reader has now determined the effective stream encoding

            inputEncoding = Charset.forName(reader.getEncoding());

            // If the language encoding wasn't specified within the envelope, it might have been specified in the META tags

            if (isLanguageDetectionEnabled && job.getLanguageDetectionDefault() == null && language == null
                    && reader.getLanguages() != null && reader.getLanguages().size() > 0) {
                if (job.getLanguageDetectionSupported() != null
                        && !job.getLanguageDetectionSupported().contains(reader.getLanguages().get(0)))
                    logger.warn("The detected META language (" + reader.getLanguages().get(0)
                            + ") does not fall within the list of supported languages for this job - resorting to n-gram detection");
                else
                    language = reader.getLanguages().get(0);
            }
        } else {
            if (preferenceEncoding != null)
                inputEncoding = Charset.forName(preferenceEncoding);
            else {
                try {
                    CharsetDetector detector = new CharsetDetector().setText(bufferedStream);

                    boolean isPlausible = false;
                    CharsetMatch[] matches = detector.detectAll();

                    if (language != null && PLAUSIBLE_ENCODING_MAPPING.containsKey(language)) {
                        for (CharsetMatch match : matches) {
                            String encodingName = match.getName();

                            // Give preference to certain character sets based on a possibly given language

                            if (PLAUSIBLE_ENCODING_MAPPING.get(language).contains(encodingName)) {
                                inputEncoding = Charset.forName(encodingName);

                                isPlausible = true;

                                break;
                            }
                        }
                    } else if (job.getLanguageDetectionParameter() != null
                            && job.getLanguageDetectionSupported() != null
                            && job.getLanguageDetectionSupported().size() > 0) {
                        // Gather the encodings that are plausible only for unsupported languages

                        List<String> nonPlausibleEncodings = new ArrayList<String>();

                        for (Map.Entry<String, List<String>> entry : PLAUSIBLE_ENCODING_MAPPING.entrySet())
                            if (!job.getLanguageDetectionSupported().contains(entry.getKey()))
                                nonPlausibleEncodings.addAll(entry.getValue());

                        nonPlausibleEncodings.remove("UTF-8");
                        nonPlausibleEncodings.remove("UTF-16");

                        // Then detect against those

                        for (CharsetMatch match : matches) {
                            String encodingName = match.getName();

                            if (!nonPlausibleEncodings.contains(encodingName)) {
                                inputEncoding = Charset.forName(encodingName);

                                isPlausible = true;

                                break;
                            }
                        }
                    }

                    // Use the first encoding from the matches if no plausible one was found

                    if (!isPlausible && matches.length > 0)
                        inputEncoding = Charset.forName(matches[0].getName());
                } catch (Exception e) {
                    // Use the default encoding
                }

                bufferedStream.reset();
            }
        }

        // Add the Content-Type to the metadata

        metadata.set(HttpHeaders.CONTENT_TYPE,
                mediaType + (inputEncoding != null ? "; charset=" + inputEncoding : ""));
        metadata.set(HttpHeaders.CONTENT_ENCODING, inputEncoding.toString());

        // Add the Content-Language to the metadata

        if (language != null)
            try {
                LanguageMappingType languageMapping = LanguageMappingType.valueOf(language);

                metadata.set(HttpHeaders.CONTENT_LANGUAGE, languageMapping.language);
            } catch (IllegalArgumentException e) {
                metadata.set(HttpHeaders.CONTENT_LANGUAGE, language);
            }

        if (logger.isDebugEnabled())
            logger.debug("Given import envelope or reference encoding is " + preferenceEncoding
                    + " - stream is thought to contain " + inputEncoding
                    + (language != null ? " - using original language hint " + language : ""));

        return inputEncoding;
    }

    /**
     * Apply word breaking to the given content.
     * 
     * @param envelopeReference
     * @param content
     * @param wordBreakMin
     * @param wordBreakMax
     * @return List<ProcessorContent>
     */
    public static List<ProcessorContent> applyWordBreaking(final String envelopeReference, final String content,
            final Integer wordBreakMin, final Integer wordBreakMax) {
        List<ProcessorContent> contentResult = new ArrayList<ProcessorContent>();
        String strippedContent = stripContent(content);

        // Note that paragraph (\n\n) breaking is favored over between-sentence breaking by excluding ? and ! sentence endings

        if (wordBreakMin > 0 && wordBreakMax > 0) {
            String contentSection = "";

            String[] splitTokens = strippedContent.split(" ");

            for (int tokens = 0, i = 0; i < splitTokens.length; tokens++, i++) {
                String token = splitTokens[i], nextToken = i != splitTokens.length - 1 ? splitTokens[i + 1] : null;

                contentSection += token + (nextToken != null ? " " : "");

                // Break on the first dotted line ending or double newline following the lower word limit

                if ((tokens >= wordBreakMin && isTokenBreaksSegment(token, nextToken)) || tokens >= wordBreakMax) {
                    // In case of a line or paragraph split, move the content forward

                    String[] splitContent = token.split("\n", 2);

                    if (splitContent.length > 1) {
                        contentResult.add(new ProcessorContent(contentSection
                                .substring(0, contentSection.length() - splitContent[1].length()).trim(),
                                contentResult.size() + " - " + envelopeReference));

                        contentSection = splitContent[1].trim() + " ";
                    } else {
                        contentResult.add(new ProcessorContent(contentSection.trim(),
                                contentResult.size() + " - " + envelopeReference));

                        contentSection = "";
                    }

                    tokens = 0;
                }
            }

            if (!StringUtils.isEmpty(contentSection))
                contentResult.add(new ProcessorContent(contentSection.trim(),
                        contentResult.size() + " - " + envelopeReference));
            else
                logger.warn("Given content has been emptied by prior enrichment steps - not offering as a result");
        } else
            contentResult
                    .add(new ProcessorContent(strippedContent, contentResult.size() + " - " + envelopeReference));

        return contentResult;
    }

    /**
     * Strip the output of superfluous whitespace and newline characters.
     * 
     * @param content
     * @return String
     */
    public static String stripContent(final String content) {
        StringBuilder result = new StringBuilder();

        // Collapse whitespace and newlines, replacing non-breaking spaces

        String[] splitContent = content.replace(CHARACTER_NBSP, ' ').split("\n");
        int emptyLines = 0;

        for (int i = 0; i < splitContent.length; i++) {
            String line = splitContent[i].trim();

            if (!line.equals("")) {
                result.append(Pattern.compile("[\t ]+").matcher(line).replaceAll(" ") + "\n");

                emptyLines = 0;
            } else if (emptyLines < 1 && i != 0) {
                result.append("\n");

                emptyLines++;
            }
        }

        return result.toString().trim();
    }

    /**
     * Determine whether a given token breaks a line or paragraph segment.
     * 
     * @param token
     * @param nextToken
     * @return boolean
     */
    private static boolean isTokenBreaksSegment(final String token, final String nextToken) {
        boolean endOfLine = token.endsWith(".");
        boolean endOfParagraph = token.contains("\n"); // a "\n\n" paragraph break necessarily contains "\n"
        boolean nextTokenStartsSentence = StringUtils.isEmpty(nextToken) || endOfParagraph
                || Character.isUpperCase(nextToken.charAt(0));

        return (endOfLine || endOfParagraph) && nextTokenStartsSentence;
    }

    /**
     * Determine the character set from a given language-charset combination. The result will be [ language, encoding ] where encoding can be null.
     * 
     * @param value
     * @return String[]
     */
    private static String[] retrieveLanguageEncoding(final String value) {
        // Strip any quotes

        String languageEncoding = value.replaceAll("\"", "");

        // Then split it up based on character encoding

        String encoding = null, language = languageEncoding;

        for (int i = 0; i < languageEncoding.length(); i++)
            if (Character.isUpperCase(languageEncoding.charAt(i))) {
                encoding = languageEncoding.substring(i);

                // The value may lack a language prefix altogether, in which case only an encoding was given

                if (i > 0)
                    language = languageEncoding.substring(0, i);
                else
                    language = null;

                break;
            }

        return new String[] { language != null ? language.split("-")[0].trim() : null,
                encoding != null ? encoding.toUpperCase() : null };
    }

    /**
     * Retrieve a grid stream from GridFS.
     *
     * @param id
     * @return InputStream
     */
    private InputStream retrieveGridStream(final String id) {
        if (this.gridFs == null)
            this.gridFs = new GridFS(dbFactory.getDb());

        GridFSDBFile file = gridFs.find(new ObjectId(id));

        return file.getInputStream();
    }

    /**
     * Return the preprocessors.
     * 
     * @return <code>Map<String, List<PreProcessor>></code>
     */
    @Override
    public Map<String, List<PreProcessor>> getPreprocessors() {
        return preprocessors;
    }

    /**
     * Set the preprocessors.
     * 
     * @param preprocessors
     */
    @Override
    public void setPreprocessors(final Map<String, List<PreProcessor>> preprocessors) {
        this.preprocessors = preprocessors;
    }

    /**
     * Retrieve the generated UIMA units.
     * 
     * @return Map<String, UIMAUnit>
     */
    public Map<String, UIMAUnit> getUIMAUnits() {
        return uimaUnits;
    }

    /**
     * Create the context.
     * 
     * @return ProcessorContext
     */
    @Override
    public ProcessorContext createContext() {
        ProcessorContext context = new ProcessorContext();

        UIMAInjector.initialize(context);

        return context;
    }

    /**
     * Reset the context.
     * 
     * @param context
     */
    @Override
    public void resetContext(final ProcessorContext context) {
        // XXX: Content resets are already done after each document processing
    }

    /**
     * Release the context.
     * 
     * @param context
     */
    @Override
    public void releaseContext(final ProcessorContext context) {
        UIMAInjector.release(context);
    }
}
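The two public static helpers at the end of the listing, stripContent and applyWordBreaking, have no dependencies on the rest of the pipeline and can be tried in isolation. Below is a minimal sketch of such a call, not part of the original sources; the reference "doc-1", the word limits 3 and 10 and the demo class name are all made up for illustration.

import java.util.List;

import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
import nl.minbzk.dwr.zoeken.enricher.processor.TikaProcessor;

public class WordBreakingExample {
    public static void main(String[] args) {
        String content = "First sentence of the first paragraph. Second sentence.\n\n"
                + "A second\tparagraph   with redundant whitespace.";

        // Collapse runs of tabs/spaces and blank lines (applyWordBreaking does this internally too)
        System.out.println(TikaProcessor.stripContent(content));

        // Break the content into segments of at least 3 and at most 10 words;
        // every segment gets a content ID of the form "<index> - <reference>"
        List<ProcessorContent> segments = TikaProcessor.applyWordBreaking("doc-1", content, 3, 10);

        for (ProcessorContent segment : segments)
            System.out.println(segment.getContentId() + ": " + segment.getContent());
    }
}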
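Likewise, the detection in categorizeCybozu is plain Cybozu langdetect. As a point of reference, the standalone detection flow looks roughly like this; the profile directory is a placeholder that must point at the langdetect JSON profiles.

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;

public class LanguageDetectionExample {
    public static void main(String[] args) throws Exception {
        // Profiles may only be loaded once per JVM - hence the getLangList() guard
        // in the TikaProcessor constructor; the path below is hypothetical
        if (DetectorFactory.getLangList().size() == 0)
            DetectorFactory.loadProfile("/path/to/langdetect/profiles");

        Detector detector = DetectorFactory.create();
        detector.append("De kat krabt de krullen van de trap.");

        // detect() returns the most probable ISO 639-1 code, e.g. "nl" here
        System.out.println(detector.detect());
    }
}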