com.seajas.search.contender.service.modifier.SourceElementModifierService.java Source code

Java tutorial

Introduction

Here is the source code for com.seajas.search.contender.service.modifier.SourceElementModifierService.java

Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.profiler.model.modifier.Modifier;
import com.seajas.search.bridge.profiler.model.modifier.ModifierFilter;
import com.seajas.search.bridge.profiler.model.modifier.ModifierScript;
import com.seajas.search.contender.WebResolverSettings;
import com.seajas.search.contender.http.SizeRestrictedHttpResponse;
import com.seajas.search.contender.http.SizeRestrictedResponseHandler;
import com.seajas.search.contender.replication.ModifierCache;
import com.seajas.search.contender.scripting.XmlHtmlReader;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import javax.script.ScriptException;

/**
 * Modifier service for source elements.
 * 
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class SourceElementModifierService extends AbstractModifierService {
    /**
     * The class-wide SLF4J logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(SourceElementModifierService.class);

    /**
     * The retrieval HTTP client, injected via the "retrievalHttpClient" qualifier; used by
     * {@code getContent} to fetch every URI whose scheme is not "file".
     */
    @Autowired
    @Qualifier("retrievalHttpClient")
    private HttpClient httpClient;

    /**
     * The modifier filter processor; applied to each {@code ModifierFilter} of a modifier, reader in and reader out.
     */
    @Autowired
    private ModifierFilterProcessor modifierFilterProcessor;

    /**
     * The modifier script processor; applied to each {@code ModifierScript} of a modifier, taking the full content
     * as a string together with the URI and resolver settings.
     */
    @Autowired
    private ModifierScriptProcessor modifierScriptProcessor;

    /**
     * The modifier cache; queried for result modifiers either by ID or by URL match.
     */
    @Autowired
    private ModifierCache modifierCache;

    /**
     * The Tika auto-detect parser; only its detector is used here, to determine the media type of retrieved content.
     */
    @Autowired
    private AutoDetectParser autoDetectParser;

    /**
     * Default constructor.
     */
    public SourceElementModifierService() {
        super();
    }

    /**
     * Property-driven constructor; both values are handed unchanged to the parent constructor.
     *
     * @param maximumContentLength value of the {@code contender.project.http.maximum.result.size} property
     * @param preferredEnclosures value of the {@code contender.project.rss.reader.preferred.enclosures} property
     */
    @Autowired
    public SourceElementModifierService(
            final @Value("${contender.project.http.maximum.result.size}") Long maximumContentLength,
            final @Value("${contender.project.rss.reader.preferred.enclosures}") String preferredEnclosures) {
        super(maximumContentLength, preferredEnclosures);
    }

    /**
     * Return the first usable result URI from the given feed.
     *
     * An entry is usable when its link parses as a URL with the "http" or "https" protocol, or with the "file"
     * protocol provided that the feed URI itself uses the "file" scheme. Entries whose link cannot be parsed
     * are logged and skipped.
     *
     * @param uri the URI the feed was retrieved from; a null or scheme-less URI is treated as non-local
     * @param feed the feed to pick an entry from; may be null
     * @return the URI of the first usable entry, or null when the feed is null or holds no usable entry
     */
    public URI getResultUri(final URI uri, final SyndFeed feed) {
        logger.info("Retrieving result URI from modifier feed URI " + uri);

        if (feed == null || feed.getEntries() == null)
            return null;

        // Local file links are only acceptable when the feed itself was read from a local file; the
        // constant-first comparison keeps a null scheme (relative URI) from raising a NullPointerException

        final boolean localFeed = uri != null && "file".equalsIgnoreCase(uri.getScheme());

        // Return the first URL to test against

        for (Object candidate : feed.getEntries()) {
            SyndEntry entry = (SyndEntry) candidate;

            try {
                String protocol = new URL(entry.getLink()).getProtocol();

                boolean acceptable = "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol)
                        || (localFeed && "file".equalsIgnoreCase(protocol));

                if (!acceptable)
                    continue;

                // getEntryLink() is inherited; a null result is skipped rather than passed to new URI(...)

                String link = getEntryLink(entry);

                if (link != null)
                    return new URI(link);
            } catch (MalformedURLException e) {
                logger.error("The given URL to test with is invalid", e);
            } catch (URISyntaxException e) {
                logger.error("Unable to resolve URI", e);
            }
        }

        return null;
    }

    /**
     * Test a given result modifier chain against the content found at the given URI.
     *
     * The content is retrieved through {@code getContent}, its final (post-redirect) URI is matched against the
     * modifier's URL expression, and every filter and script of the modifier is then applied in order. For each
     * one the content before and after processing is compared.
     *
     * I/O and script failures during processing are logged, not rethrown; the results gathered up to that
     * point are still returned.
     *
     * @param modifierId the ID of the result modifier to test
     * @param uri the URI to retrieve test content from
     * @param encodingOverride the character encoding to read the content with, or null to let the reader decide
     * @param userAgent the user agent to retrieve the content with; may be null
     * @throws Exception when the modifier is unknown, the content cannot be retrieved, the final URI is not
     *         covered by the modifier's URL expression, or the content is neither text nor XML-based
     * @return a map keyed "Filter_" + filter ID or "Script_" + script ID, holding true when that step changed
     *         the content
     */
    public Map<String, Boolean> testModifier(Integer modifierId, URI uri, String encodingOverride, String userAgent)
            throws Exception {
        WebResolverSettings settings = new WebResolverSettings();
        settings.setMaximumContentLength(maximumContentLength);
        settings.setUserAgent(userAgent);

        Map<String, Boolean> result = new HashMap<String, Boolean>();

        logger.info("Testing result modifier with ID " + modifierId + " and URI " + uri);

        Modifier modifier = modifierCache.getResultModifierById(modifierId);

        // Fail early and descriptively, before any content is fetched, instead of with a late NullPointerException

        if (modifier == null)
            throw new Exception("No result modifier could be found for ID " + modifierId);

        // Retrieve the content

        Content contentResult = getContent(uri, encodingOverride, null, userAgent);

        if (contentResult == null)
            throw new Exception("The given modifier content retrieval for testing purposes failed");

        // Verify the expression in the same way the finalUrl is contrasted to the initial URL

        final URI effectiveUri = contentResult.getUri() != null ? contentResult.getUri() : uri;

        if (!effectiveUri.equals(uri) && logger.isInfoEnabled())
            logger.info("Modifier result has different final (post-redirect) URL from original: " + effectiveUri);

        if (!Pattern.matches(modifier.getUrlExpression(), effectiveUri.toString()))
            throw new Exception("The given testing result URL is not covered by the modifier expression");

        // Only textual content can be run through the filters and scripts

        final String mediaType = contentResult.getMediaType();

        if (!(mediaType.startsWith("text/") || mediaType.contains("/xhtml") || mediaType.contains("/xml")))
            throw new Exception("The given test feed URL does not contain appropriate content for testing ("
                    + mediaType + ") - must be text or XML-based");

        Reader reader = null;

        try {
            InputStream inputStream = contentResult.getInputStream();

            if (encodingOverride != null)
                reader = new InputStreamReader(inputStream, encodingOverride);
            else
                reader = new XmlHtmlReader(inputStream, contentResult.getContentType(), true);

            // NOTE(review): readerToBuffer() is inherited; it presumably drains the reader into the buffer and
            // returns a fresh reader over the same content - confirm against the parent class

            for (ModifierFilter filter : modifier.getFilters()) {
                StringBuffer current = new StringBuffer(), updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);
                reader = readerToBuffer(updated, modifierFilterProcessor.process(filter, reader), false);

                result.put("Filter_" + filter.getId(), !current.toString().equals(updated.toString()));
            }

            for (ModifierScript script : modifier.getScripts()) {
                StringBuffer current = new StringBuffer(), updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);

                reader = readerToBuffer(updated,
                        modifierScriptProcessor.process(script, extractAndClose(reader), effectiveUri, settings,
                                false),
                        false);

                result.put("Script_" + script.getId(), !current.toString().equals(updated.toString()));
            }
        } catch (IOException e) {
            logger.error("Could not test the given result: " + e.getMessage(), e);
        } catch (ScriptException e) {
            logger.error("Could not test the given result: " + e.getMessage(), e);
        } finally {
            // Also release the reader when a processor fails with an unexpected runtime exception

            IOUtils.closeQuietly(reader);
        }

        return result;
    }

    /**
     * Retrieve the result content for the given URI.
     *
     * URIs with the "file" scheme are read from the local file system; every other URI is fetched with the
     * retrieval HTTP client, restricted to the maximum content length. The content is buffered fully in memory,
     * its media type is detected with the Tika detector, and the URI reached after redirects is recorded.
     *
     * @param resultUri the URI to retrieve; must be absolute
     * @param encodingOverride a character encoding to hand to the media type detection; may be null
     * @param resultHeaders additional request headers for HTTP retrieval; may be null
     * @param userAgent the user agent for HTTP retrieval; may be null
     * @return the retrieved content, or null when the URI has no scheme, no response was produced, or an
     *         I/O error occurred
     */
    public Content getContent(final URI resultUri, final String encodingOverride,
            final Map<String, String> resultHeaders, final String userAgent) {
        if (resultUri == null || resultUri.getScheme() == null) {
            logger.error("Could not retrieve the given URL - it is not absolute: " + resultUri);

            return null;
        }

        URI uriAfterRedirects = null;

        // Retrieve the content

        Header contentType = null;

        try {
            byte[] payload;

            // Local file streams can only be read if the parent scheme is also local

            if (!resultUri.getScheme().equalsIgnoreCase("file")) {
                HttpGet method = new HttpGet(resultUri);

                if (resultHeaders != null)
                    for (Entry<String, String> resultHeader : resultHeaders.entrySet())
                        method.setHeader(new BasicHeader(resultHeader.getKey(), resultHeader.getValue()));

                // CoreProtocolPNames.USER_AGENT is the parameter key "http.useragent", not a header name, so
                // the header is named explicitly here

                if (userAgent != null)
                    method.setHeader("User-Agent", userAgent);

                HttpContext context = new BasicHttpContext();

                SizeRestrictedHttpResponse response = httpClient.execute(method,
                        new SizeRestrictedResponseHandler(maximumContentLength, resultUri), context);

                if (response == null)
                    return null;

                HttpUriRequest currentRequest = (HttpUriRequest) context
                        .getAttribute(ExecutionContext.HTTP_REQUEST);
                HttpHost currentHost = (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);

                // Without both attributes the final URI is unknown and the original URI is used further down

                if (currentRequest != null && currentHost != null)
                    try {
                        uriAfterRedirects = new URI(currentHost.toURI()).resolve(currentRequest.getURI());
                    } catch (URISyntaxException e) {
                        logger.error(String.format("Final URI '%s' is mysteriously invalid", currentHost.toURI()),
                                e);
                    }

                payload = response.getResponse();
                contentType = response.getContentType();
            } else {
                InputStream fileStream = new FileInputStream(resultUri.getPath());

                // Make sure the file handle is released even when reading fails half-way

                try {
                    payload = IOUtils.toByteArray(fileStream);
                } finally {
                    IOUtils.closeQuietly(fileStream);
                }
            }

            // Now determine the content type and create a reader in case of structured content

            Metadata metadata = new Metadata();

            final String contentTypeValue = contentType != null ? contentType.getValue() : null;
            final boolean hasContentType = StringUtils.hasText(contentTypeValue);

            // MediaType.parse() yields null for an unparseable header; fall back to the raw header in that case

            MediaType declaredType = hasContentType && encodingOverride != null ? MediaType.parse(contentTypeValue)
                    : null;

            if (declaredType != null)
                metadata.add(HttpHeaders.CONTENT_TYPE,
                        declaredType.getType() + "/" + declaredType.getSubtype() + "; charset=" + encodingOverride);
            else if (hasContentType)
                metadata.add(HttpHeaders.CONTENT_TYPE, contentTypeValue);
            else if (encodingOverride != null)
                // NOTE(review): Content-Encoding usually names a content coding rather than a charset - kept
                // as in the original, confirm the intent
                metadata.add(HttpHeaders.CONTENT_ENCODING, encodingOverride);

            // The detector needs a mark-supporting stream; the payload array is only ever read, so the
            // detection stream and the returned stream can safely share it

            MediaType mediaType = autoDetectParser.getDetector().detect(new ByteArrayInputStream(payload), metadata);

            // getBaseType() returns a MediaType ("type/subtype"), so combining it with the subtype produced
            // "type/subtype/subtype"; getType() gives the intended parameter-less "type/subtype"

            return new Content(new ByteArrayInputStream(payload),
                    mediaType.getType() + "/" + mediaType.getSubtype(),
                    contentTypeValue,
                    uriAfterRedirects != null ? uriAfterRedirects : resultUri);
        } catch (IOException e) {
            logger.error("Could not retrieve the given URL", e);

            return null;
        }
    }

    /**
     * Run the given content through every result modifier whose URL expression matches the given URI.
     *
     * Per matching modifier, all filters are applied first (reader to reader), followed by all scripts; each
     * script receives the complete content as a string, which closes the reader it was read from.
     *
     * @param reader the content to modify
     * @param uri the URI the content belongs to; used for modifier matching and passed on to the scripts
     * @param userAgent the user agent to place in the resolver settings
     * @param resultParameters the result parameters to place in the resolver settings
     * @param resultHeaders the result headers to place in the resolver settings
     * @return the reader produced by the last applied step, or the given reader when nothing was applied
     * @throws IOException when reading or processing the content fails
     * @throws ScriptException when a modifier script fails
     */
    public Reader applyModifiers(Reader reader, URI uri, String userAgent, Map<String, String> resultParameters,
            Map<String, String> resultHeaders) throws IOException, ScriptException {
        final WebResolverSettings resolverSettings = new WebResolverSettings();

        resolverSettings.setMaximumContentLength(maximumContentLength);
        resolverSettings.setUserAgent(userAgent);
        resolverSettings.setResultParameters(resultParameters);
        resolverSettings.setResultHeaders(resultHeaders);

        Reader current = reader;

        for (Modifier matched : modifierCache.getResultModifiersByUrlMatch(uri.toString())) {
            for (ModifierFilter step : matched.getFilters()) {
                current = modifierFilterProcessor.process(step, current);
            }

            for (ModifierScript step : matched.getScripts()) {
                final String content = extractAndClose(current);

                current = modifierScriptProcessor.process(step, content, uri, resolverSettings, true);
            }
        }

        return current;
    }

    /**
     * Read the given reader to its end and close it, ignoring any failure to close.
     *
     * @param reader the reader to drain; closed in all cases, also when reading fails
     * @return the complete content of the reader
     * @throws IOException when reading fails
     */
    private String extractAndClose(final Reader reader) throws IOException {
        final String content;

        try {
            content = IOUtils.toString(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }

        return content;
    }

    /**
     * Immutable holder for retrieved content: the content stream, its media type, the raw "Content-Type" header
     * value and the URI it was ultimately retrieved from.
     *
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    public static class Content {
        /** The content itself, as a stream. */
        private final InputStream inputStream;

        /** The media type, without parameters. */
        private final String mediaType;

        /** The value of the "Content-Type" header. */
        private final String contentType;

        /** The URI after redirect processing. */
        private final URI uri;

        /**
         * Create a content holder from its four parts; the values are stored as given.
         *
         * @param inputStream the content stream
         * @param mediaType the media type, without parameters
         * @param contentType the "Content-Type" header value
         * @param uri the URI after redirect processing
         */
        public Content(final InputStream inputStream, final String mediaType, final String contentType,
                final URI uri) {
            this.uri = uri;
            this.contentType = contentType;
            this.mediaType = mediaType;
            this.inputStream = inputStream;
        }

        /**
         * @return the content stream
         */
        public InputStream getInputStream() {
            return this.inputStream;
        }

        /**
         * @return the media type, without parameters
         */
        public String getMediaType() {
            return this.mediaType;
        }

        /**
         * @return the "Content-Type" header value
         */
        public String getContentType() {
            return this.contentType;
        }

        /**
         * @return the URI after redirect processing
         */
        public URI getUri() {
            return this.uri;
        }
    }
}