org.codelibs.fess.crawler.extractor.impl.AbstractXmlExtractor.java Source code

Introduction

Here is the source code for org.codelibs.fess.crawler.extractor.impl.AbstractXmlExtractor.java
Source

/*
 * Copyright 2012-2016 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.crawler.extractor.impl;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for extractors that turn XML-like markup into plain text.
 *
 * <p>
 * Subclasses supply two regular expressions: one that locates the encoding
 * declaration at the head of the document ({@link #getEncodingPattern()}) and
 * one that matches a markup tag ({@link #getTagPattern()}). Each matched tag is
 * replaced by the values of its double-quoted attributes, so attribute text is
 * kept while the markup itself is dropped.
 * </p>
 *
 * @author shinsuke
 *
 */
public abstract class AbstractXmlExtractor {

    protected static final Logger logger = LoggerFactory.getLogger(AbstractXmlExtractor.class);

    /** Translator that unescapes named (HTML 4) and numeric character entities. */
    protected static final CharSequenceTranslator UNESCAPE_HTML4 = new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()), new NumericEntityUnescaper());

    /** Matches a single CR or LF character. */
    private static final Pattern LINE_BREAK_PATTERN = Pattern.compile("[\\r\\n]");

    /**
     * Matches a whole comment, including empty ones and ones whose body
     * contains {@code >}. The quantifier is reluctant so that adjacent comments
     * are matched separately instead of being merged into one.
     */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /** Matches a double-quoted attribute inside a tag; group 1 is the value. */
    private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile("\\s[^ ]+=\"([^\"]*)\"");

    /** Matches a run of whitespace. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /** Encoding used when none can be detected from the document head. */
    protected String encoding = Constants.UTF_8;

    /** Number of leading bytes inspected when looking for the encoding declaration. */
    protected int preloadSizeForCharset = 2048;

    /**
     * When {@code true}, comments are removed together with their content;
     * when {@code false}, only the comment delimiters are removed and the
     * comment text is kept.
     */
    protected boolean ignoreCommentTag = false;

    /**
     * @return the pattern used to find the encoding declaration in the head of
     *         the document; capture group 1 must be the charset name.
     */
    protected abstract Pattern getEncodingPattern();

    /**
     * @return the pattern matching one markup tag; every match is replaced by
     *         the tag's attribute values.
     */
    protected abstract Pattern getTagPattern();

    /**
     * Reads the whole stream, decodes it with the detected (or default)
     * encoding, unescapes character entities and strips the markup.
     *
     * @param in
     *            the stream to read; must not be {@code null}.
     * @param params
     *            extraction parameters; not used by this implementation.
     * @return the extracted text wrapped in an {@link ExtractData}.
     * @throws CrawlerSystemException
     *             if {@code in} is {@code null}.
     * @throws ExtractException
     *             if reading, decoding or extraction fails.
     */
    public ExtractData getText(final InputStream in, final Map<String, String> params) {
        if (in == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }
        try {
            final BufferedInputStream bis = new BufferedInputStream(in);
            final String enc = getEncoding(bis);
            // NOTE(review): entities are unescaped before the markup is stripped, so
            // escaped markup such as "&lt;b&gt;" becomes "<b>" and may then be removed
            // by the tag pattern. Confirm this ordering is intended.
            final String content = UNESCAPE_HTML4.translate(new String(InputStreamUtil.getBytes(bis), enc));
            return new ExtractData(extractString(content));
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Detects the document encoding from the first
     * {@link #preloadSizeForCharset} bytes of the stream and then rewinds the
     * stream to where it was.
     *
     * @param bis
     *            the stream to inspect.
     * @return the charset name captured by {@link #getEncodingPattern()} when it
     *         is supported by this JVM, otherwise the default
     *         {@link #encoding}.
     * @throws ExtractException
     *             if the stream cannot be reset after the look-ahead.
     */
    protected String getEncoding(final BufferedInputStream bis) {
        final byte[] b = new byte[preloadSizeForCharset];
        try {
            bis.mark(preloadSizeForCharset);

            // A single read() may return fewer bytes than requested, so keep
            // reading until the preload buffer is full or the stream ends.
            int length = 0;
            while (length < b.length) {
                final int c = bis.read(b, length, b.length - length);
                if (c <= 0) {
                    break;
                }
                length += c;
            }
            if (length == 0) {
                return encoding;
            }

            final String head = new String(b, 0, length, encoding);
            if (StringUtil.isBlank(head)) {
                return encoding;
            }
            final Matcher matcher = getEncodingPattern().matcher(head);
            if (matcher.find()) {
                final String enc = matcher.group(1);
                if (Charset.isSupported(enc)) {
                    return enc;
                }
            }
        } catch (final Exception e) {
            // Detection is best effort: fall back to the default encoding.
            logger.info("Use a default encoding: {}", encoding, e);
        } finally {
            try {
                bis.reset();
            } catch (final IOException e) {
                throw new ExtractException(e);
            }
        }

        return encoding;
    }

    /**
     * Strips markup from the given content.
     *
     * <p>
     * CR and LF characters become spaces, comments are handled according to
     * {@link #ignoreCommentTag}, every tag matched by {@link #getTagPattern()}
     * is replaced by its double-quoted attribute values (each followed by a
     * space), and finally runs of whitespace are collapsed to one space and the
     * result is trimmed.
     * </p>
     *
     * @param content
     *            the decoded document text.
     * @return the text with markup removed.
     */
    protected String extractString(final String content) {
        String input = LINE_BREAK_PATTERN.matcher(content).replaceAll(" ");
        if (ignoreCommentTag) {
            input = COMMENT_PATTERN.matcher(input).replaceAll("");
        } else {
            input = input.replace("<!--", "").replace("-->", "");
        }
        final Matcher matcher = getTagPattern().matcher(input);
        final StringBuffer sb = new StringBuffer(input.length());
        while (matcher.find()) {
            final Matcher attrMatcher = ATTRIBUTE_PATTERN.matcher(matcher.group());
            final StringBuilder buf = new StringBuilder(100);
            while (attrMatcher.find()) {
                buf.append(attrMatcher.group(1)).append(' ');
            }
            // Attribute values are literal text: escape '\' and '$' so they are
            // not read as group references in the replacement string.
            matcher.appendReplacement(sb, Matcher.quoteReplacement(buf.toString()));
        }
        matcher.appendTail(sb);
        return WHITESPACE_PATTERN.matcher(sb.toString()).replaceAll(" ").trim();
    }

    /**
     * @return Returns the default encoding.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * @param encoding
     *            The default encoding to set.
     */
    public void setEncoding(final String encoding) {
        this.encoding = encoding;
    }

    /**
     * @return Returns the preloadSizeForCharset.
     */
    public int getPreloadSizeForCharset() {
        return preloadSizeForCharset;
    }

    /**
     * @param preloadSizeForCharset
     *            The preloadSizeForCharset to set.
     */
    public void setPreloadSizeForCharset(final int preloadSizeForCharset) {
        this.preloadSizeForCharset = preloadSizeForCharset;
    }

    /**
     * @return Returns the ignoreCommentTag.
     */
    public boolean isIgnoreCommentTag() {
        return ignoreCommentTag;
    }

    /**
     * @param ignoreCommentTag
     *            The ignoreCommentTag to set.
     */
    public void setIgnoreCommentTag(final boolean ignoreCommentTag) {
        this.ignoreCommentTag = ignoreCommentTag;
    }

}